diff --git a/.copyright.hook b/.copyright.hook new file mode 100644 index 0000000000000..7cb4721940fd4 --- /dev/null +++ b/.copyright.hook @@ -0,0 +1,134 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import io +import re +import sys +import os +import datetime + +COPYRIGHT = '''Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.''' + +def _generate_copyright(comment_mark): + copyright=COPYRIGHT.split(os.linesep) + header = copyright[0].rstrip() + + p = re.search('(\d{4})', header).group(0) + now = datetime.datetime.now() + + header = header.replace(p,str(now.year)) + + ans=[comment_mark + " " + header + os.linesep] + for idx, line in enumerate(copyright[1:]): + ans.append(comment_mark + " " + line.rstrip() + os.linesep) + + return ans + +def _get_comment_mark(path): + lang_type=re.compile(r"\.(py|sh)$") + if lang_type.search(path) is not None: + return "#" + + lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$") + if lang_type.search(path) is not None: + return "//" + + return None + + +RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE) +RE_COPYRIGHT = re.compile(r".*Copyright( \(c\))* \d{4}", re.IGNORECASE) +RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!") + +def _check_copyright(path): + head=[] + try: + with open(path) as f: + head = [next(f) for x in range(4)] + except StopIteration: + pass + + for idx, line in enumerate(head): + if RE_COPYRIGHT.search(line) is not None: + return True + + return False + +def generate_copyright(path, comment_mark): + original_contents = io.open(path, encoding="utf-8").readlines() + head = original_contents[0:4] + + insert_line_no=0 + for i, line in enumerate(head): + if RE_ENCODE.search(line) or RE_SHEBANG.search(line): + insert_line_no=i+1 + + copyright = _generate_copyright(comment_mark) + if insert_line_no == 0: + new_contents = copyright + if len(original_contents) > 0 and len(original_contents[0].strip()) != 0: + new_contents.append(os.linesep) + new_contents.extend(original_contents) + else: + new_contents=original_contents[0:insert_line_no] + new_contents.append(os.linesep) + new_contents.extend(copyright) + if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0: + new_contents.append(os.linesep) + 
new_contents.extend(original_contents[insert_line_no:]) + new_contents="".join(new_contents) + + with io.open(path, 'w') as output_file: + output_file.write(new_contents) + + + +def main(argv=None): + parser = argparse.ArgumentParser( + description='Checker for copyright declaration.') + parser.add_argument('filenames', nargs='*', help='Filenames to check') + args = parser.parse_args(argv) + + retv = 0 + for path in args.filenames: + comment_mark = _get_comment_mark(path) + if comment_mark is None: + print("warning:Unsupported file", path, file=sys.stderr) + continue + + if _check_copyright(path): + continue + + generate_copyright(path, comment_mark) + + +if __name__ == '__main__': + exit(main()) \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000..0833e9852c8f2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,7 @@ +[flake8] +ignore = E203, E402, E501, E731, E741, W503, W605, E722 +max-line-length = 119 + +# E402: module level import not at top of file +per-file-ignores = + __init__.py:F401,F403,E402 \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b21ccc980216..578b2b16f65ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,44 +1,45 @@ -- repo: https://github.com/PaddlePaddle/mirrors-yapf.git - sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 +repos: +# For Python files +- repo: https://github.com/psf/black.git + rev: 22.8.0 hooks: - - id: yapf - files: \.py$ + - id: black + files: \.(py|pyi)$ + additional_dependencies: [toml] +- repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort +- repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 - repo: https://github.com/pre-commit/pre-commit-hooks - sha: a11d9314b22d8f8c7556443875b731ef05965464 + rev: v4.1.0 hooks: - id: check-merge-conflict - id: check-symlinks - id: detect-private-key files: (?!.*paddle)^.*$ - id: end-of-file-fixer - files: \.(md|yml)$ + files: \.md$ - id: trailing-whitespace - files: \.(md|yml)$ + files: \.md$ - repo: https://github.com/Lucas-C/pre-commit-hooks - sha: v1.0.1 + rev: v1.1.14 hooks: - id: forbid-crlf - files: \.(md|yml)$ + files: \.md$ - id: remove-crlf - files: \.(md|yml)$ + files: \.md$ - id: forbid-tabs - files: \.(md|yml)$ + files: \.md$ - id: remove-tabs - files: \.(md|yml)$ -- repo: local - hooks: - - id: clang-format-with-version-check - name: clang-format - description: Format files with ClangFormat. - entry: bash ./.travis/codestyle/clang_format.hook -i - language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - + files: \.md$ - repo: local hooks: - - id: cpplint-cpp-source - name: cpplint - description: Check C++ code style using cpplint.py. 
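# A minimal sketch of exercising the new .copyright.hook by hand. pre-commit normally
# passes the staged filenames (see the copyright_checker entry added to
# .pre-commit-config.yaml below); the target file here is only an example.
import subprocess

subprocess.run(
    ["python", ".copyright.hook", "paddlemix/activations.py"],
    check=False,  # the hook inserts a license header into files that lack one and exits 0
)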
- entry: bash ./.travis/codestyle/cpplint_pre_commit.hook + - id: copyright_checker + name: copyright_checker + entry: python .copyright.hook language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ \ No newline at end of file + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ \ No newline at end of file diff --git a/applications/Automatic_label/automatic_label.py b/applications/Automatic_label/automatic_label.py index 69947e3f2f7d9..8aa5277232de9 100644 --- a/applications/Automatic_label/automatic_label.py +++ b/applications/Automatic_label/automatic_label.py @@ -23,13 +23,16 @@ import requests from paddlenlp.trainer import PdArgumentParser from paddlenlp.transformers import AutoTokenizer -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel from paddlemix.processors.blip_processing import ( - Blip2Processor, BlipImageProcessor, BlipTextProcessor) + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor from paddlemix.processors.sam_processing import SamProcessor from paddlemix.utils.log import logger @@ -48,9 +51,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) ax.text(x0, y0, label) @@ -79,28 +80,36 @@ class ModelArguments: blip2_model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="automatic_label", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def generate_caption(raw_image, prompt, processor, blip2_model): @@ -110,10 +119,10 @@ def generate_caption(raw_image, prompt, processor, blip2_model): text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) generated_ids, scores = blip2_model.generate(**inputs) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True)[0].strip() + 
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) return generated_text @@ -123,10 +132,7 @@ def generate_tags(caption): lemma = nltk.wordnet.WordNetLemmatizer() nltk.download(["punkt", "averaged_perceptron_tagger", "wordnet"]) - tags_list = [ - word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) - if pos[0] == "N" - ] + tags_list = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) if pos[0] == "N"] tags_lemma = [lemma.lemmatize(w) for w in tags_list] tags = ", ".join(map(str, tags_lemma)) @@ -140,19 +146,17 @@ def main(): logger.info("blip2_model: {}".format(model_args.blip2_model_name_or_path)) # bulid blip2 processor - blip2_tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + blip2_tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) blip2_image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.blip2_model_name_or_path, "processor", "eval")) + os.path.join(model_args.blip2_model_name_or_path, "processor", "eval") + ) blip2_text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.blip2_model_name_or_path, "processor", "eval")) - blip2_processor = Blip2Processor(blip2_image_processor, - blip2_text_processor_class, - blip2_tokenizer_class) + os.path.join(model_args.blip2_model_name_or_path, "processor", "eval") + ) + blip2_processor = Blip2Processor(blip2_image_processor, blip2_text_processor_class, blip2_tokenizer_class) # #bulid blip2 model - blip2_model = Blip2ForConditionalGeneration.from_pretrained( - model_args.blip2_model_name_or_path) + blip2_model = Blip2ForConditionalGeneration.from_pretrained(model_args.blip2_model_name_or_path) paddle.device.cuda.empty_cache() blip2_model.eval() @@ -160,21 +164,17 @@ def main(): logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() logger.info("dino_model build finish!") # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") logger.info("SamModel build finish!") # read image @@ -188,7 +188,8 @@ def main(): image_pil, prompt=data_args.prompt, processor=blip2_processor, - blip2_model=blip2_model, ) + blip2_model=blip2_model, + ) det_prompt = generate_tags(caption) logger.info("det prompt: {}".format(det_prompt)) @@ -196,8 +197,7 @@ def main(): image_pil = image_pil.convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=det_prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=det_prompt) with paddle.no_grad(): outputs = dino_model( @@ 
-205,9 +205,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -243,8 +243,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -267,7 +266,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) logger.info("finish!") diff --git a/applications/CVinW/grounded_sam.py b/applications/CVinW/grounded_sam.py index 86a326e8947ab..35d251f59ef6f 100644 --- a/applications/CVinW/grounded_sam.py +++ b/applications/CVinW/grounded_sam.py @@ -13,9 +13,7 @@ # limitations under the License. import os -import sys from dataclasses import dataclass, field -from typing import List import matplotlib.pyplot as plt import numpy as np @@ -23,7 +21,7 @@ import paddle.nn.functional as F import requests from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel @@ -45,9 +43,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) ax.text(x0, y0, label) @@ -61,8 +57,7 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - prompt: str = field( - default=None, metadata={"help": "The prompt of the image to be det."}) + prompt: str = field(default=None, metadata={"help": "The prompt of the image to be det."}) @dataclass @@ -73,22 +68,28 @@ class ModelArguments: dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="grounded_sam_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -96,32 +97,26 @@ def main(): model_args, 
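# A condensed, hedged sketch of the detect-then-segment flow implemented in
# automatic_label.py above (the grounded_sam scripts in this diff follow the same
# pattern). Weight download, thresholding and visualization are omitted; the input
# file and prompt below are only illustrative.
import paddle
from PIL import Image

from paddlemix.models.groundingdino.modeling import GroundingDinoModel
from paddlemix.models.sam.modeling import SamModel
from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor
from paddlemix.processors.sam_processing import SamProcessor

image_pil = Image.open("example.jpg").convert("RGB")
det_prompt = "dog, frisbee"

# 1) GroundingDino turns the text prompt into candidate boxes
dino_processor = GroudingDinoProcessor.from_pretrained("GroundingDino/groundingdino-swint-ogc")
dino_model = GroundingDinoModel.from_pretrained("GroundingDino/groundingdino-swint-ogc")
dino_model.eval()
image_tensor, mask, tok = dino_processor(images=image_pil, text=det_prompt)
with paddle.no_grad():
    outputs = dino_model(
        image_tensor,
        mask,
        input_ids=tok["input_ids"],
        attention_mask=tok["attention_mask"],
        text_self_attention_masks=tok["text_self_attention_masks"],
        position_ids=tok["position_ids"],
    )

# 2) boxes above the box/text thresholds are converted to absolute xyxy coordinates
#    (see the scripts for that step) and handed to SAM as box prompts
sam_processor = SamProcessor.from_pretrained("Sam/SamVitH-1024")
sam_model = SamModel.from_pretrained("Sam/SamVitH-1024", input_type="boxs")
# image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None)
# seg_masks = sam_processor.postprocess_masks(sam_model(img=image_seg, prompt=prompt))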
data_args = parser.parse_args_into_dataclasses() url = data_args.input_image # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") # read image if os.path.isfile(url): # read image image_pil = Image.open(url).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=data_args.prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=data_args.prompt) with paddle.no_grad(): outputs = dino_model( @@ -129,9 +124,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -167,8 +162,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -190,7 +184,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) logger.info("finish!") diff --git a/applications/Inpainting/grounded_sam_chatglm.py b/applications/Inpainting/grounded_sam_chatglm.py index a983379e2450b..dd99dc6bc8007 100644 --- a/applications/Inpainting/grounded_sam_chatglm.py +++ b/applications/Inpainting/grounded_sam_chatglm.py @@ -22,7 +22,7 @@ import requests from paddlenlp import Taskflow from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel @@ -45,9 +45,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 
0), lw=2)) ax.text(x0, y0, label) @@ -60,11 +58,14 @@ class DataArguments: the command line. """ - input_image: str = field(metadata={"help": "The name of input image."}, ) + input_image: str = field( + metadata={"help": "The name of input image."}, + ) prompt: str = field( default=None, - metadata={"help": "The prompt of the image to be inpaint."}, ) + metadata={"help": "The prompt of the image to be inpaint."}, + ) @dataclass @@ -75,36 +76,45 @@ class ModelArguments: stable_diffusion_pipeline_name_or_path: str = field( default="stabilityai/stable-diffusion-2-inpainting", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) chatglm_model_name_or_path: str = field( default="THUDM/chatglm-6b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="inpainting_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def filter_prompts_with_chatglm(caption, model_name_or_path="THUDM/chatglm-6b"): prompt = ( "Given caption,extract the main object to be replaced and marked it as 'main_object', " - + f"Extract the remaining part as 'other prompt', " + - f"Return main_object, other prompt in English" + - f"Given caption: {caption}.") + + "Extract the remaining part as 'other prompt', " + + "Return main_object, other prompt in English" + + "Given caption: {}.".format(caption) + ) logger.info("chatglm: {}".format(model_name_or_path)) textGen = Taskflow("text2text_generation", model=model_name_or_path) @@ -113,7 +123,8 @@ def filter_prompts_with_chatglm(caption, model_name_or_path="THUDM/chatglm-6b"): det_prompt, inpaint_prompt = ( reply.split("\n")[0].split(":")[-1].strip(), - reply.split("\n")[-1].split(":")[-1].strip(), ) + reply.split("\n")[-1].split(":")[-1].strip(), + ) return det_prompt, inpaint_prompt @@ -125,21 +136,17 @@ def main(): logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() logger.info("dino_model build finish!") # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = 
SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") logger.info("SamModel build finish!") # read image @@ -149,16 +156,14 @@ def main(): else: image_pil = Image.open(requests.get(url, stream=True).raw) - det_prompt, inpaint_prompt = filter_prompts_with_chatglm( - data_args.prompt, model_args.chatglm_model_name_or_path) + det_prompt, inpaint_prompt = filter_prompts_with_chatglm(data_args.prompt, model_args.chatglm_model_name_or_path) logger.info("det prompt: {}".format(det_prompt)) logger.info("inpaint prompt: {}".format(inpaint_prompt)) image_pil = image_pil.convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=det_prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=det_prompt) with paddle.no_grad(): outputs = dino_model( @@ -166,9 +171,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -204,8 +209,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -227,12 +231,11 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) - logger.info("stable diffusion pipeline: {}".format( - model_args.stable_diffusion_pipeline_name_or_path)) - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_args.stable_diffusion_pipeline_name_or_path) + logger.info("stable diffusion pipeline: {}".format(model_args.stable_diffusion_pipeline_name_or_path)) + pipe = StableDiffusionInpaintPipeline.from_pretrained(model_args.stable_diffusion_pipeline_name_or_path) logger.info("stable diffusion pipeline build finish!") merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) @@ -242,11 +245,9 @@ def main(): image_pil = image_pil.resize((512, 512)) mask_pil = mask_pil.resize((512, 512)) - image = pipe( - prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0] + image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0] image = image.resize(size) - image.save( - os.path.join(model_args.output_dir, "grounded_sam_chatglm_output.jpg")) + image.save(os.path.join(model_args.output_dir, "grounded_sam_chatglm_output.jpg")) logger.info("finish!") diff --git a/applications/Inpainting/grounded_sam_inpainting.py b/applications/Inpainting/grounded_sam_inpainting.py index 1fa8aacc4a39c..eccc41359072f 100644 --- a/applications/Inpainting/grounded_sam_inpainting.py +++ b/applications/Inpainting/grounded_sam_inpainting.py @@ -21,7 +21,7 @@ import 
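# A hedged sketch of the inpainting tail of grounded_sam_chatglm.py above
# (grounded_sam_inpainting.py in this diff follows the same pattern): SAM masks are
# merged into one binary mask, image and mask are resized to 512x512 for the Stable
# Diffusion inpainting pipeline, and the result is resized back. The pipeline import
# path and the boolean-mask-to-PIL conversion are assumptions; the rest mirrors the diff.
import paddle
from PIL import Image
from ppdiffusers import StableDiffusionInpaintPipeline  # import path assumed


def inpaint_with_masks(image_pil, seg_masks, inpaint_prompt,
                       pipeline_name="stabilityai/stable-diffusion-2-inpainting"):
    merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) > 0
    mask_pil = Image.fromarray(merge_mask.cast("uint8").numpy()[0][0] * 255)  # assumed conversion
    pipe = StableDiffusionInpaintPipeline.from_pretrained(pipeline_name)
    size = image_pil.size
    out = pipe(
        prompt=inpaint_prompt,
        image=image_pil.resize((512, 512)),
        mask_image=mask_pil.resize((512, 512)),
    ).images[0]
    return out.resize(size)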
paddle.nn.functional as F import requests from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel @@ -44,9 +44,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) ax.text(x0, y0, label) @@ -59,15 +57,19 @@ class DataArguments: the command line. """ - input_image: str = field(metadata={"help": "The name of input image."}, ) + input_image: str = field( + metadata={"help": "The name of input image."}, + ) det_prompt: str = field( default=None, - metadata={"help": "The prompt of the image to be det."}, ) + metadata={"help": "The prompt of the image to be det."}, + ) inpaint_prompt: str = field( default=None, - metadata={"help": "The prompt of the image to be inpaint."}, ) + metadata={"help": "The prompt of the image to be inpaint."}, + ) @dataclass @@ -78,25 +80,32 @@ class ModelArguments: stable_diffusion_pipeline_name_or_path: str = field( default="stabilityai/stable-diffusion-2-inpainting", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="inpainting_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -104,29 +113,23 @@ def main(): model_args, data_args = parser.parse_args_into_dataclasses() url = data_args.input_image - logger.info("stable diffusion pipeline: {}".format( - model_args.stable_diffusion_pipeline_name_or_path)) - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_args.stable_diffusion_pipeline_name_or_path) + logger.info("stable diffusion pipeline: {}".format(model_args.stable_diffusion_pipeline_name_or_path)) + pipe = StableDiffusionInpaintPipeline.from_pretrained(model_args.stable_diffusion_pipeline_name_or_path) logger.info("stable diffusion pipeline build finish!") logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = 
GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() logger.info("dino_model build finish!") # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") logger.info("SamModel build finish!") # read image @@ -142,8 +145,7 @@ def main(): image_pil = image_pil.convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=data_args.det_prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=data_args.det_prompt) with paddle.no_grad(): outputs = dino_model( @@ -151,9 +153,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -189,8 +191,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -212,7 +213,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) merge_mask = merge_mask > 0 @@ -221,13 +223,9 @@ def main(): image_pil = image_pil.resize((512, 512)) mask_pil = mask_pil.resize((512, 512)) - image = pipe( - prompt=data_args.inpaint_prompt, image=image_pil, - mask_image=mask_pil).images[0] + image = pipe(prompt=data_args.inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0] image = image.resize(size) - image.save( - os.path.join(model_args.output_dir, - "grounded_sam_inpainting_output.jpg")) + image.save(os.path.join(model_args.output_dir, "grounded_sam_inpainting_output.jpg")) logger.info("finish!") diff --git a/deploy/groundingdino/export.py b/deploy/groundingdino/export.py index 1e617cd852353..86de4d5085a5c 100644 --- a/deploy/groundingdino/export.py +++ b/deploy/groundingdino/export.py @@ -31,12 +31,12 @@ def _prune_input_spec(input_spec, program, targets): pruned_input_spec = [{}] program = program.clone() program = program._prune(targets=targets) - global_block = program.global_block() + # global_block = program.global_block() for spec in input_spec: try: name = spec.name - v = global_block.var(name) + # v = global_block.var(name) pruned_input_spec[0][name] = spec except Exception: pass @@ -47,20 +47,12 @@ def _prune_input_spec(input_spec, program, targets): def apply_to_static(model): input_spec = [ - InputSpec( - shape=[None, 3, None, None], name="x", dtype="float32"), - InputSpec( - shape=[None, None, None], name="m", dtype="int64"), - InputSpec( - 
shape=[None, None], name="input_ids", dtype="int64"), - InputSpec( - shape=[None, None], name="attention_mask", dtype="int64"), - InputSpec( - shape=[None, None, None], - name="text_self_attention_masks", - dtype="int64"), - InputSpec( - shape=[None, None], name="position_ids", dtype="int64"), + InputSpec(shape=[None, 3, None, None], name="x", dtype="float32"), + InputSpec(shape=[None, None, None], name="m", dtype="int64"), + InputSpec(shape=[None, None], name="input_ids", dtype="int64"), + InputSpec(shape=[None, None], name="attention_mask", dtype="int64"), + InputSpec(shape=[None, None, None], name="text_self_attention_masks", dtype="int64"), + InputSpec(shape=[None, None], name="position_ids", dtype="int64"), ] model = paddle.jit.to_static(model, input_spec=input_spec) return model, input_spec @@ -74,13 +66,15 @@ def apply_to_static(model): "-dt", type=str, default="GroundingDino/groundingdino-swint-ogc", - help="dino type", ) + help="dino type", + ) parser.add_argument( "--output_dir", "-o", type=str, default="output_groundingdino", - help="output directory", ) + help="output directory", + ) args = parser.parse_args() output_dir = args.output_dir @@ -93,4 +87,5 @@ def apply_to_static(model): paddle.jit.save( static_model, os.path.join(output_dir, "groundingdino_model"), - input_spec=input_spec, ) + input_spec=input_spec, + ) diff --git a/deploy/groundingdino/predict.py b/deploy/groundingdino/predict.py index 836529faab8ed..60c13c20fcf5e 100644 --- a/deploy/groundingdino/predict.py +++ b/deploy/groundingdino/predict.py @@ -25,25 +25,26 @@ from PIL import Image, ImageDraw, ImageFont from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor -from paddlemix.utils.log import logger ms_deformable_attn = load( name="deformable_detr_ops", sources=[ "./paddlemix/models/groundingdino/csrc/ms_deformable_attn_op.cc", "./paddlemix/models/groundingdino/csrc/ms_deformable_attn_op.cu", - ], ) + ], +) def load_predictor( - model_dir, - run_mode="paddle", - batch_size=1, - device="GPU", - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - delete_shuffle_pass=False, ): + model_dir, + run_mode="paddle", + batch_size=1, + device="GPU", + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + delete_shuffle_pass=False, +): """set AnalysisConfig, generate AnalysisPredictor Args: model_dir (str): root path of __model__ and __params__ @@ -64,8 +65,8 @@ def load_predictor( """ if device != "GPU" and run_mode != "paddle": raise ValueError( - "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}". - format(run_mode, device)) + "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".format(run_mode, device) + ) infer_model = os.path.join(model_dir, "groundingdino_model.pdmodel") infer_params = os.path.join(model_dir, "groundingdino_model.pdiparams") @@ -93,10 +94,8 @@ def load_predictor( config.enable_mkldnn() if enable_mkldnn_bfloat16: config.enable_mkldnn_bfloat16() - except Exception as e: - print( - "The current environment does not support `mkldnn`, so disable mkldnn." 
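# A short sketch of the export/inference round trip used by deploy/groundingdino
# above: apply_to_static() plus paddle.jit.save write groundingdino_model.pdmodel /
# .pdiparams, which load_predictor() in predict.py then loads through
# paddle.inference. The GPU memory pool size below is illustrative.
import os

from paddle.inference import Config, create_predictor

output_dir = "output_groundingdino"
config = Config(
    os.path.join(output_dir, "groundingdino_model.pdmodel"),
    os.path.join(output_dir, "groundingdino_model.pdiparams"),
)
config.enable_use_gpu(500, 0)  # initial memory pool (MB), GPU id
config.switch_ir_optim(True)
predictor = create_predictor(config)
print(predictor.get_input_names())  # x, m, input_ids, attention_mask, ...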
- ) + except Exception: + print("The current environment does not support `mkldnn`, so disable mkldnn.") pass # disable print log when predict @@ -154,8 +153,7 @@ def plot_boxes_to_image(image_pil, tgt): class Predictor(object): def __init__(self, model_args, data_args): - self.processor = GroudingDinoProcessor.from_pretrained( - model_args.text_encoder_type) + self.processor = GroudingDinoProcessor.from_pretrained(model_args.text_encoder_type) self.box_threshold = model_args.box_threshold self.text_threshold = model_args.text_threshold self.predictor, self.config = load_predictor(model_args.model_path) @@ -171,8 +169,7 @@ def create_inputs(self): self.input_map["m"] = np.array(self.mask.numpy(), dtype="int64") for key in self.tokenized_input.keys(): - self.input_map[key] = np.array( - self.tokenized_input[key].numpy(), dtype="int64") + self.input_map[key] = np.array(self.tokenized_input[key].numpy(), dtype="int64") input_names = self.predictor.get_input_names() for i in range(len(input_names)): @@ -181,8 +178,7 @@ def create_inputs(self): def preprocess(self, image, text): - self.image, self.mask, self.tokenized_input = self.processor( - images=image, text=text) + self.image, self.mask, self.tokenized_input = self.processor(images=image, text=text) def run(self, image, prompt): self.preprocess(image, data_args.prompt) @@ -190,10 +186,8 @@ def run(self, image, prompt): self.create_inputs() self.predictor.run() output_names = self.predictor.get_output_names() - pred_boxes = self.predictor.get_output_handle(output_names[ - 0]).copy_to_cpu() - pred_logits = self.predictor.get_output_handle(output_names[ - 1]).copy_to_cpu() + pred_boxes = self.predictor.get_output_handle(output_names[0]).copy_to_cpu() + pred_logits = self.predictor.get_output_handle(output_names[1]).copy_to_cpu() pred_dict = { "pred_logits": paddle.to_tensor(pred_logits), @@ -219,8 +213,7 @@ def postprocess(self, outputs, with_logits=True): for logit, box in zip(logits_filt, boxes_filt): pred_phrase = self.processor.decode(logit > self.text_threshold) if with_logits: - pred_phrases.append(pred_phrase + - f"({str(logit.max().item())[:4]})") + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") else: pred_phrases.append(pred_phrase) @@ -235,8 +228,7 @@ def main(model_args, data_args): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") boxes_filt, pred_phrases = predictor.run(image_pil, data_args.prompt) @@ -265,9 +257,7 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - prompt: str = field( - default=None, - metadata={"help": "The prompt of the image to be generated."}) + prompt: str = field(default=None, metadata={"help": "The prompt of the image to be generated."}) @dataclass @@ -278,30 +268,32 @@ class ModelArguments: model_path: str = field( default="output_groundingdino/", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_encoder_type: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "type for text encoder ."}, ) + metadata={"help": "type for text encoder ."}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - 
metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) run_mode: str = field( default="paddle", - metadata={ - "help": "mode of running(paddle/trt_fp32/trt_fp16/trt_int8)." - }, ) + metadata={"help": "mode of running(paddle/trt_fp32/trt_fp16/trt_int8)."}, + ) device: str = field( default="GPU", - metadata={ - "help": - "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU." - }, ) + metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."}, + ) if __name__ == "__main__": diff --git a/deploy/sam/export.py b/deploy/sam/export.py index a2ecf4f879a12..e56c1eb7aaa33 100644 --- a/deploy/sam/export.py +++ b/deploy/sam/export.py @@ -14,7 +14,6 @@ import argparse import os -import sys import paddle import yaml @@ -30,24 +29,28 @@ def parse_args(): choices=["SamVitL", "SamVitB", "SamVitH"], required=True, help="The model type.", - type=str, ) + type=str, + ) parser.add_argument( "--input_type", choices=["boxs", "points", "points_grid"], required=True, help="The model type.", - type=str, ) + type=str, + ) parser.add_argument( "--save_dir", help="The directory for saving the exported inference model", type=str, - default="./output/inference_model", ) + default="./output/inference_model", + ) parser.add_argument( "--input_img_shape", nargs="+", help="Export the model with fixed input shape, e.g., `--input_img_shape 1 3 512 1024`.", type=int, - default=[1, 3, 1024, 1024], ) + default=[1, 3, 1024, 1024], + ) return parser.parse_args() @@ -56,11 +59,9 @@ def main(args): os.environ["PADDLESEG_EXPORT_STAGE"] = "True" - model = SamModel.from_pretrained( - args.model_type, input_type=args.input_type) + model = SamModel.from_pretrained(args.model_type, input_type=args.input_type) - shape = ([None, 3, None, None] - if args.input_img_shape is None else args.input_img_shape) + shape = [None, 3, None, None] if args.input_img_shape is None else args.input_img_shape if args.input_type == "points": shape2 = [1, 1, 2] elif args.input_type == "boxs": @@ -69,10 +70,8 @@ def main(args): shape2 = [64, 1, 2] input_spec = [ - paddle.static.InputSpec( - shape=shape, dtype="float32"), - paddle.static.InputSpec( - shape=shape2, dtype="int32"), + paddle.static.InputSpec(shape=shape, dtype="float32"), + paddle.static.InputSpec(shape=shape2, dtype="int32"), ] model.eval() model = paddle.jit.to_static(model, input_spec=input_spec) diff --git a/deploy/sam/predict.py b/deploy/sam/predict.py index cde1d2b01fe68..00d260a840e65 100644 --- a/deploy/sam/predict.py +++ b/deploy/sam/predict.py @@ -19,15 +19,12 @@ import matplotlib.pyplot as plt import numpy as np -import paddle -import paddle.nn.functional as F import requests import yaml from paddle.inference import Config as PredictConfig -from paddle.inference import PrecisionType, create_predictor -from paddle.utils.cpp_extension import load +from paddle.inference import create_predictor from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.processors.sam_processing import SamProcessor from paddlemix.utils.log import logger @@ -60,9 +57,13 @@ def params(self): def use_auto_tune(args): - return (hasattr(PredictConfig, "collect_shape_range_info") and - hasattr(PredictConfig, "enable_tuned_tensorrt_dynamic_shape") and - args.device == "gpu" and args.use_trt and args.enable_auto_tune) + return ( + 
hasattr(PredictConfig, "collect_shape_range_info") + and hasattr(PredictConfig, "enable_tuned_tensorrt_dynamic_shape") + and args.device == "gpu" + and args.use_trt + and args.enable_auto_tune + ) def auto_tune(args, imgs, img_nums): @@ -80,8 +81,8 @@ def auto_tune(args, imgs, img_nums): logger.info("Auto tune the dynamic shape for GPU TRT.") assert use_auto_tune(args), ( - "Do not support auto_tune, which requires " - "device==gpu && use_trt==True && paddle >= 2.2") + "Do not support auto_tune, which requires " "device==gpu && use_trt==True && paddle >= 2.2" + ) if not isinstance(imgs, (list, tuple)): imgs = [imgs] @@ -114,8 +115,8 @@ def auto_tune(args, imgs, img_nums): except Exception as e: logger.info(str(e)) logger.info( - "Auto tune failed. Usually, the error is out of GPU memory " - "for the model or image is too large. \n") + "Auto tune failed. Usually, the error is out of GPU memory " "for the model or image is too large. \n" + ) del predictor if os.path.exists(args.auto_tuned_shape_file): os.remove(args.auto_tuned_shape_file) @@ -153,7 +154,8 @@ def __init__(self, args): logger.info( "If the above error is '(InvalidArgument) some trt inputs dynamic shape info not set, " "..., Expected all_dynamic_shape_set == true, ...', " - "please set --enable_auto_tune=True to use auto_tune. \n") + "please set --enable_auto_tune=True to use auto_tune. \n" + ) exit() def _init_base_config(self): @@ -182,12 +184,6 @@ def _init_gpu_config(self): """ logger.info("Use GPU") self.pred_cfg.enable_use_gpu(100, 0) - precision_map = { - "fp16": PrecisionType.Half, - "fp32": PrecisionType.Float32, - "int8": PrecisionType.Int8, - } - precision_mode = precision_map[self.args.precision] def run(self, image, prompt_out): image, prompt_out = self.preprocess(image, prompt_out) @@ -218,7 +214,8 @@ def preprocess(self, image, prompts): image, input_type=self.args.input_type, box=prompts["boxs"], - point_coords=prompts["points"], ) + point_coords=prompts["points"], + ) return [image_seg, prompt] @@ -236,11 +233,8 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - box_prompt: List[int] = field( - default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) - points_prompt: List[int] = field( - default=None, - metadata={"help": "point promt format as [[xy],[xy]...]."}) + box_prompt: List[int] = field(default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) + points_prompt: List[int] = field(default=None, metadata={"help": "point promt format as [[xy],[xy]...]."}) @dataclass @@ -251,53 +245,56 @@ class ModelArguments: model_name_or_path: str = field( default="Sam/SamVitH", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) input_type: str = field( default="boxs", - metadata={ - "help": - "The model prompt type, choices ['boxs', 'points', 'points_grid']." - }, ) + metadata={"help": "The model prompt type, choices ['boxs', 'points', 'points_grid']."}, + ) cfg: str = field( default=None, - metadata={"help": "The config file."}, ) + metadata={"help": "The config file."}, + ) use_trt: bool = field( default=False, - metadata={ - "help": "Whether to use Nvidia TensorRT to accelerate prediction." 
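# A hedged sketch of the two-pass TensorRT dynamic-shape tuning that
# use_auto_tune()/auto_tune() above rely on: pass 1 records shape ranges, pass 2
# feeds them to the TRT engine. Model file names and sizes are illustrative only.
from paddle.inference import Config, PrecisionType, create_predictor

shape_file = "shape_range_info.pbtxt"

# pass 1: run a few representative inputs with shape collection enabled
cfg = Config("model.pdmodel", "model.pdiparams")
cfg.enable_use_gpu(100, 0)
cfg.collect_shape_range_info(shape_file)
predictor = create_predictor(cfg)
# ... feed sample images through `predictor` here ...

# pass 2: build the TRT engine with the recorded dynamic-shape ranges
cfg2 = Config("model.pdmodel", "model.pdiparams")
cfg2.enable_use_gpu(100, 0)
cfg2.enable_tensorrt_engine(
    workspace_size=1 << 30,
    min_subgraph_size=3,
    precision_mode=PrecisionType.Float32,
    use_static=False,
    use_calib_mode=False,
)
cfg2.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
tuned_predictor = create_predictor(cfg2)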
- }, ) + metadata={"help": "Whether to use Nvidia TensorRT to accelerate prediction."}, + ) precision: str = field( default="fp32", - metadata={"help": "The tensorrt precision."}, ) + metadata={"help": "The tensorrt precision."}, + ) min_subgraph_size: int = field( default=3, - metadata={"help": "The min subgraph size in tensorrt prediction.'"}, ) + metadata={"help": "The min subgraph size in tensorrt prediction.'"}, + ) enable_auto_tune: bool = field( default=False, metadata={ - "help": - "Whether to enable tuned dynamic shape. We uses some images to collect \ + "help": "Whether to enable tuned dynamic shape. We uses some images to collect \ the dynamic shape for trt sub graph, which avoids setting dynamic shape manually." - }, ) + }, + ) device: str = field( default="GPU", - metadata={ - "help": - "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU." - }, ) + metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."}, + ) cpu_threads: int = field( default=10, - metadata={"help": "Number of threads to predict when using cpu."}, ) + metadata={"help": "Number of threads to predict when using cpu."}, + ) enable_mkldnn: bool = field( default=False, - metadata={"help": "Enable to use mkldnn to speed up when using cpu."}, ) + metadata={"help": "Enable to use mkldnn to speed up when using cpu."}, + ) output_dir: str = field( default="seg_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(model_args, data_args): @@ -308,8 +305,7 @@ def main(model_args, data_args): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") if data_args.box_prompt is not None: data_args.box_prompt = np.array(data_args.box_prompt) @@ -323,10 +319,7 @@ def main(model_args, data_args): predictor = Predictor(model_args) image_pil = Image.open(data_args.input_image).convert("RGB") - seg_masks = predictor.run(image_pil, { - "points": data_args.points_prompt, - "boxs": data_args.box_prompt - }) + seg_masks = predictor.run(image_pil, {"points": data_args.points_prompt, "boxs": data_args.box_prompt}) if model_args.visual: # make dir @@ -342,10 +335,10 @@ def main(model_args, data_args): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) - if use_auto_tune(model_args) and os.path.exists( - model_args.auto_tuned_shape_file): + if use_auto_tune(model_args) and os.path.exists(model_args.auto_tuned_shape_file): os.remove(model_args.auto_tuned_shape_file) diff --git a/paddlemix/activations.py b/paddlemix/activations.py index c3119b2315377..ab9be11679283 100644 --- a/paddlemix/activations.py +++ b/paddlemix/activations.py @@ -28,9 +28,9 @@ class NewGELUActivation(nn.Layer): """ def forward(self, input: Tensor) -> Tensor: - return (0.5 * input * (1.0 + paddle.tanh( - math.sqrt(2.0 / math.pi) * - (input + 0.044715 * paddle.pow(input, 3.0))))) + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) class GELUActivation(nn.Layer): @@ -41,7 +41,7 @@ class GELUActivation(nn.Layer): Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ - def 
__init__(self, use_gelu_python: bool=False): + def __init__(self, use_gelu_python: bool = False): super().__init__() if use_gelu_python: self.act = self._gelu_python @@ -61,9 +61,7 @@ class FastGELUActivation(nn.Layer): """ def forward(self, input: Tensor) -> Tensor: - return (0.5 * input * - (1.0 + paddle.tanh(input * 0.7978845608 * - (1.0 + 0.044715 * input * input)))) + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) class QuickGELUActivation(nn.Layer): @@ -90,8 +88,7 @@ class ClippedGELUActivation(nn.Layer): def __init__(self, min: float, max: float): if min > max: - raise ValueError( - f"min should be < max (got min: {min}, max: {max})") + raise ValueError(f"min should be < max (got min: {min}, max: {max})") super().__init__() self.min = min @@ -142,15 +139,10 @@ def __getitem__(self, key): ACT2CLS = { "gelu": GELUActivation, - "gelu_10": (ClippedGELUActivation, { - "min": -10, - "max": 10 - }), + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, - "gelu_python": (GELUActivation, { - "use_gelu_python": True - }), + "gelu_python": (GELUActivation, {"use_gelu_python": True}), "linear": LinearActivation, "mish": MishActivation, "quick_gelu": QuickGELUActivation, @@ -168,9 +160,7 @@ def get_activation(activation_string): if activation_string in ACT2FN: return ACT2FN[activation_string] else: - raise KeyError( - f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}" - ) + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") # For backwards compatibility with: from activations import gelu_python diff --git a/paddlemix/appflow/appflow.py b/paddlemix/appflow/appflow.py index 4823fee7482bb..d26e362965302 100644 --- a/paddlemix/appflow/appflow.py +++ b/paddlemix/appflow/appflow.py @@ -32,16 +32,8 @@ class Appflow(object): """ - def __init__(self, - app, - models=None, - mode=None, - device_id=0, - from_hf_hub=False, - **kwargs): - assert ( - app in APPLICATIONS - ), f"The task name:{app} is not in Taskflow list, please check your task name." + def __init__(self, app, models=None, mode=None, device_id=0, from_hf_hub=False, **kwargs): + assert app in APPLICATIONS, f"The task name:{app} is not in Taskflow list, please check your task name." 
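# A small usage sketch for the activation registry reformatted above: get_activation()
# resolves a string name through ACT2FN and returns a ready-to-call layer, raising
# KeyError for unknown names. The tensor values are only illustrative.
import paddle

from paddlemix.activations import get_activation

act = get_activation("gelu_new")            # NewGELUActivation (tanh approximation)
print(act(paddle.to_tensor([-1.0, 0.0, 1.0])))

clipped = get_activation("gelu_10")         # ClippedGELUActivation with min=-10, max=10
print(clipped(paddle.to_tensor([20.0])))    # output is clipped into [-10, 10]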
self.app = app # Set the device for the task device = get_env_device() @@ -55,16 +47,14 @@ def __init__(self, self.models = models if isinstance(self.models, list) and len(self.models) > 0: for model in self.models: - assert model in set(APPLICATIONS[app][tag].keys( - )), f"The {tag} name: {model} is not in task:[{app}]" + assert model in set(APPLICATIONS[app][tag].keys()), f"The {tag} name: {model} is not in task:[{app}]" else: self.models = [APPLICATIONS[app]["default"][ind_tag]] self.task_instances = [] for model in self.models: if "task_priority_path" in APPLICATIONS[self.app][tag][model]: - priority_path = APPLICATIONS[self.app][tag][model][ - "task_priority_path"] + priority_path = APPLICATIONS[self.app][tag][model]["task_priority_path"] else: priority_path = None @@ -79,7 +69,9 @@ def __init__(self, task=self.app, priority_path=priority_path, from_hf_hub=from_hf_hub, - **kwargs, )) + **kwargs, + ) + ) app_list = APPLICATIONS.keys() Appflow.app_list = app_list diff --git a/paddlemix/appflow/apptask.py b/paddlemix/appflow/apptask.py index 7b7bdc6055650..694204c312c22 100644 --- a/paddlemix/appflow/apptask.py +++ b/paddlemix/appflow/apptask.py @@ -42,28 +42,22 @@ def __init__(self, model, task, priority_path=None, **kwargs): self._priority_path = priority_path self.is_static_model = kwargs.get("is_static_model", False) - self._home_path = (self.kwargs["home_path"] - if "home_path" in self.kwargs else PPMIX_HOME) + self._home_path = self.kwargs["home_path"] if "home_path" in self.kwargs else PPMIX_HOME if "task_path" in self.kwargs: self._task_path = self.kwargs["task_path"] self._model_dir = self._task_path elif self._priority_path: - self._task_path = os.path.join(self._home_path, "models", - self._priority_path) + self._task_path = os.path.join(self._home_path, "models", self._priority_path) self._model_dir = os.path.join(self._home_path, "models") else: - self._task_path = os.path.join(self._home_path, "models", - self.model) + self._task_path = os.path.join(self._home_path, "models", self.model) self._model_dir = os.path.join(self._home_path, "models") - self._infer_precision = (self.kwargs["precision"] - if "precision" in self.kwargs else "fp32") + self._infer_precision = self.kwargs["precision"] if "precision" in self.kwargs else "fp32" # Default to use Paddle Inference self._predictor_type = "paddle-inference" - self._num_threads = (self.kwargs["num_threads"] - if "num_threads" in self.kwargs else - math.ceil(cpu_count() / 2)) + self._num_threads = self.kwargs["num_threads"] if "num_threads" in self.kwargs else math.ceil(cpu_count() / 2) def _construct_tokenizer(self, model): """ @@ -83,8 +77,7 @@ def _get_static_model_name(self): if len(names) == 0: raise IOError(f"{self._task_path} should include '.pdparams' file.") if len(names) > 1: - logger.warning( - f"{self._task_path} includes more than one '.pdparams' file.") + logger.warning(f"{self._task_path} includes more than one '.pdparams' file.") return names[0] def _convert_dygraph_to_static(self): @@ -98,12 +91,10 @@ def _convert_dygraph_to_static(self): self._input_spec is not None ), "The input spec must be created before converting the dygraph model to static model." 
logger.info("Converting to the inference model cost a little time.") - static_model = paddle.jit.to_static( - self._model, input_spec=self._input_spec) + static_model = paddle.jit.to_static(self._model, input_spec=self._input_spec) paddle.jit.save(static_model, self.inference_model_path) - logger.info("The inference model save in the path:{}".format( - self.inference_model_path)) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) def _prepare_static_mode(self): """ @@ -139,50 +130,46 @@ def _prepare_static_mode(self): min_subgraph_size=30, precision_mode=precision_map[self._infer_precision], use_static=True, - use_calib_mode=False, ) + use_calib_mode=False, + ) if not os.path.exists(self._tuned_trt_shape_file): - self._config.collect_shape_range_info( - self._tuned_trt_shape_file) + self._config.collect_shape_range_info(self._tuned_trt_shape_file) else: - logger.info(f"Use dynamic shape file: " - f"{self._tuned_trt_shape_file} for TRT...") - self._config.enable_tuned_tensorrt_dynamic_shape( - self._tuned_trt_shape_file, True) + logger.info(f"Use dynamic shape file: " f"{self._tuned_trt_shape_file} for TRT...") + self._config.enable_tuned_tensorrt_dynamic_shape(self._tuned_trt_shape_file, True) if self.task == "openset_det_sam": self._config.delete_pass("add_support_int8_pass") if self.model == "GroundingDino/groundingdino-swint-ogc": - self._config.exp_disable_tensorrt_ops([ - "pad3d", - "set_value", - "reduce_all", - "cumsum_8.tmp_0", - "linear_296.tmp_1", - ]) + self._config.exp_disable_tensorrt_ops( + [ + "pad3d", + "set_value", + "reduce_all", + "cumsum_8.tmp_0", + "linear_296.tmp_1", + ] + ) if self.model == "Sam/SamVitH-1024" or self.model == "Sam/SamVitH-512": self._config.delete_pass("shuffle_channel_detect_pass") self._config.delete_pass("trt_skip_layernorm_fuse_pass") self._config.delete_pass("preln_residual_bias_fuse_pass") - self._config.exp_disable_tensorrt_ops([ - "concat_1.tmp_0", - "set_value", - "empty_0.tmp_0", - "concat_55.tmp_0", - ]) + self._config.exp_disable_tensorrt_ops( + [ + "concat_1.tmp_0", + "set_value", + "empty_0.tmp_0", + "concat_55.tmp_0", + ] + ) self.predictor = paddle.inference.create_predictor(self._config) self.input_names = [name for name in self.predictor.get_input_names()] - self.input_handles = [ - self.predictor.get_input_handle(name) - for name in self.predictor.get_input_names() - ] - self.output_handle = [ - self.predictor.get_output_handle(name) - for name in self.predictor.get_output_names() - ] + self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()] + self.output_handle = [self.predictor.get_output_handle(name) for name in self.predictor.get_output_names()] def _get_inference_model(self): """ @@ -191,11 +178,10 @@ def _get_inference_model(self): # When the user-provided model path is already a static model, skip to_static conversion if self.is_static_model: - self.inference_model_path = os.path.join(self._task_path, - self._static_model_name) - if not os.path.exists(self.inference_model_path + - ".pdmodel") or not os.path.exists( - self.inference_model_path + ".pdiparams"): + self.inference_model_path = os.path.join(self._task_path, self._static_model_name) + if not os.path.exists(self.inference_model_path + ".pdmodel") or not os.path.exists( + self.inference_model_path + ".pdiparams" + ): raise IOError( f"{self._task_path} should include {self._static_model_name + '.pdmodel'} and {self._static_model_name + '.pdiparams'} while is_static_model is True" 
) @@ -205,8 +191,7 @@ def _get_inference_model(self): else: # Since 'self._task_path' is used to load the HF Hub path when 'from_hf_hub=True', we construct the static model path in a different way - self.inference_model_path = os.path.join(self._task_path, - self._static_model_name) + self.inference_model_path = os.path.join(self._task_path, self._static_model_name) self._tuned_trt_shape_file = self.inference_model_path + "_shape.txt" if not os.path.exists(self.inference_model_path + ".pdiparams"): with dygraph_mode_guard(): @@ -217,17 +202,12 @@ def _get_inference_model(self): self._static_model_file = self.inference_model_path + ".pdmodel" self._static_params_file = self.inference_model_path + ".pdiparams" - if (paddle.get_device().split(":", 1)[0] == "npu" and - self._infer_precision == "fp16"): + if paddle.get_device().split(":", 1)[0] == "npu" and self._infer_precision == "fp16": # transform fp32 model tp fp16 model self._static_fp16_model_file = self.inference_model_path + "-fp16.pdmodel" - self._static_fp16_params_file = ( - self.inference_model_path + "-fp16.pdiparams") - if not os.path.exists( - self._static_fp16_model_file) and not os.path.exists( - self._static_fp16_params_file): - logger.info( - "Converting to the inference model from fp32 to fp16.") + self._static_fp16_params_file = self.inference_model_path + "-fp16.pdiparams" + if not os.path.exists(self._static_fp16_model_file) and not os.path.exists(self._static_fp16_params_file): + logger.info("Converting to the inference model from fp32 to fp16.") paddle.inference.convert_to_mixed_precision( os.path.join(self._static_model_file), os.path.join(self._static_params_file), @@ -237,16 +217,16 @@ def _get_inference_model(self): mixed_precision=paddle.inference.PrecisionType.Half, # Here, npu sigmoid will lead to OOM and cpu sigmoid don't support fp16. # So, we add sigmoid to black list temporarily. - black_list={"sigmoid"}, ) + black_list={"sigmoid"}, + ) logger.info( - "The inference model in fp16 precison save in the path:{}". - format(self._static_fp16_model_file)) + "The inference model in fp16 precison save in the path:{}".format(self._static_fp16_model_file) + ) self._static_model_file = self._static_fp16_model_file self._static_params_file = self._static_fp16_params_file if self._predictor_type == "paddle-inference": - self._config = paddle.inference.Config(self._static_model_file, - self._static_params_file) + self._config = paddle.inference.Config(self._static_model_file, self._static_params_file) self._prepare_static_mode() else: self._prepare_onnx_mode() diff --git a/paddlemix/appflow/configuration.py b/paddlemix/appflow/configuration.py index 174729a2d8a19..bf2bd400d5b32 100644 --- a/paddlemix/appflow/configuration.py +++ b/paddlemix/appflow/configuration.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .image2image_text_guided_generation import (StableDiffusionImg2ImgTask, - StableDiffusionUpscaleTask) +from .image2image_text_guided_generation import ( + StableDiffusionImg2ImgTask, + StableDiffusionUpscaleTask, +) from .image2text_generation import Blip2CaptionTask from .openset_det_sam import OpenSetDetTask, OpenSetSegTask -from .text2image_generation import (StableDiffusionTask, - VersatileDiffusionDualGuidedTask) +from .text2image_generation import StableDiffusionTask, VersatileDiffusionDualGuidedTask from .text2image_inpaiting import StableDiffusionInpaintTask from .text2text_generation import ChatGlmTask from .text2video_generation import TextToVideoSDTask @@ -92,8 +93,7 @@ "models": { "Linaqruf/anything-v3.0": { "task_class": StableDiffusionImg2ImgTask, - "task_flag": - "image2image_text_guided_generation-Linaqruf/anything-v3.0", + "task_flag": "image2image_text_guided_generation-Linaqruf/anything-v3.0", } }, "default": { @@ -104,8 +104,7 @@ "models": { "stabilityai/stable-diffusion-x4-upscaler": { "task_class": StableDiffusionUpscaleTask, - "task_flag": - "image2image_text_guided_upscaling-stabilityai/stable-diffusion-x4-upscaler", + "task_flag": "image2image_text_guided_upscaling-stabilityai/stable-diffusion-x4-upscaler", } }, "default": { @@ -116,8 +115,7 @@ "models": { "shi-labs/versatile-diffusion": { "task_class": VersatileDiffusionDualGuidedTask, - "task_flag": - "dual_text_and_image_guided_generation-shi-labs/versatile-diffusion", + "task_flag": "dual_text_and_image_guided_generation-shi-labs/versatile-diffusion", } }, "default": { @@ -128,8 +126,7 @@ "models": { "damo-vilab/text-to-video-ms-1.7b": { "task_class": TextToVideoSDTask, - "task_flag": - "text_to_video_generation-damo-vilab/text-to-video-ms-1.7b", + "task_flag": "text_to_video_generation-damo-vilab/text-to-video-ms-1.7b", } }, "default": { diff --git a/paddlemix/appflow/image2image_text_guided_generation.py b/paddlemix/appflow/image2image_text_guided_generation.py index b58904d14ba7c..2894d7cbbafc1 100644 --- a/paddlemix/appflow/image2image_text_guided_generation.py +++ b/paddlemix/appflow/image2image_text_guided_generation.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle - -from ppdiffusers import (StableDiffusionImg2ImgPipeline, - StableDiffusionUpscalePipeline) +from ppdiffusers import StableDiffusionImg2ImgPipeline, StableDiffusionUpscalePipeline from .apptask import AppTask @@ -37,19 +34,18 @@ def _construct_model(self, model): """ # bulid model - model_instance = StableDiffusionImg2ImgPipeline.from_pretrained( - model, safety_checker=None) + model_instance = StableDiffusionImg2ImgPipeline.from_pretrained(model, safety_checker=None) self._model = model_instance def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" negative_prompt = inputs.get("negative_prompt", None) - assert negative_prompt is not None, f"The negative_prompt is None" + assert negative_prompt is not None, "The negative_prompt is None" return inputs @@ -63,7 +59,8 @@ def _run_model(self, inputs): negative_prompt=inputs["negative_prompt"], image=inputs["image"], guidance_scale=self._guidance_scale, - strength=self._strength, ).images[0] + strength=self._strength, + ).images[0] inputs.pop("prompt", None) inputs.pop("negative_prompt", None) @@ -101,9 +98,9 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" return inputs @@ -114,7 +111,8 @@ def _run_model(self, inputs): result = self._model( prompt=inputs["prompt"], - image=inputs["image"], ).images[0] + image=inputs["image"], + ).images[0] inputs.pop("prompt", None) inputs.pop("image", None) diff --git a/paddlemix/appflow/image2text_generation.py b/paddlemix/appflow/image2text_generation.py index 8e13a811cac52..1202560ee8fc5 100644 --- a/paddlemix/appflow/image2text_generation.py +++ b/paddlemix/appflow/image2text_generation.py @@ -19,7 +19,10 @@ from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration from paddlemix.processors.blip_processing import ( - Blip2Processor, BlipImageProcessor, BlipTextProcessor) + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.utils.log import logger from .apptask import AppTask @@ -41,23 +44,18 @@ def _construct_processor(self, model): Construct the tokenizer for the predictor. """ # bulid processor - tokenizer_class = AutoTokenizer.from_pretrained( - self._text_model, use_fast=False) - image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model, "processor", "eval")) - text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model, "processor", "eval")) + tokenizer_class = AutoTokenizer.from_pretrained(self._text_model, use_fast=False) + image_processor = BlipImageProcessor.from_pretrained(os.path.join(model, "processor", "eval")) + text_processor_class = BlipTextProcessor.from_pretrained(os.path.join(model, "processor", "eval")) - self._processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + self._processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) def _construct_model(self, model): """ Construct the inference model for the predictor. 
""" # bulid model - model_instance = Blip2ForConditionalGeneration.from_pretrained( - model, cache_dir=self._model_dir) + model_instance = Blip2ForConditionalGeneration.from_pretrained(model, cache_dir=self._model_dir) self._model = model_instance self._model.eval() @@ -65,7 +63,7 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = "describe the image" @@ -74,7 +72,8 @@ def _preprocess(self, inputs): text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) inputs["blip2_input"] = blip2_input @@ -97,8 +96,7 @@ def _postprocess(self, inputs): """ The model output is tag ids, this function will convert the model output to raw text. """ - generated_text = self._processor.batch_decode( - inputs["result"], skip_special_tokens=True)[0].strip() + generated_text = self._processor.batch_decode(inputs["result"], skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) inputs.pop("result", None) @@ -111,10 +109,7 @@ def _generate_tags(self, caption): lemma = nltk.wordnet.WordNetLemmatizer() nltk.download(["punkt", "averaged_perceptron_tagger", "wordnet"]) - tags_list = [ - word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) - if pos[0] == "N" - ] + tags_list = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) if pos[0] == "N"] tags_lemma = [lemma.lemmatize(w) for w in tags_list] tags = ", ".join(map(str, tags_lemma)) diff --git a/paddlemix/appflow/openset_det_sam.py b/paddlemix/appflow/openset_det_sam.py index ee2b499856645..187c9ece0a56e 100644 --- a/paddlemix/appflow/openset_det_sam.py +++ b/paddlemix/appflow/openset_det_sam.py @@ -14,13 +14,11 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.taskflow.utils import static_mode_guard from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor from paddlemix.processors.sam_processing import SamProcessor -from paddlemix.utils.log import logger from .apptask import AppTask @@ -57,21 +55,16 @@ def _construct_input_spec(self): Construct the input spec for the convert dygraph model to static model. 
""" self._input_spec = [ - paddle.static.InputSpec( - shape=[None, 3, None, None], name="x", - dtype="float32"), # image features - paddle.static.InputSpec( - shape=[None, None, None], name="m", dtype="int64"), # mask - paddle.static.InputSpec( - shape=[None, None], name="input_ids", dtype="int64"), - paddle.static.InputSpec( - shape=[None, None], name="attention_mask", dtype="int64"), + paddle.static.InputSpec(shape=[None, 3, None, None], name="x", dtype="float32"), # image features + paddle.static.InputSpec(shape=[None, None, None], name="m", dtype="int64"), # mask + paddle.static.InputSpec(shape=[None, None], name="input_ids", dtype="int64"), + paddle.static.InputSpec(shape=[None, None], name="attention_mask", dtype="int64"), paddle.static.InputSpec( shape=[None, None, None], name="text_self_attention_masks", - dtype="int64", ), - paddle.static.InputSpec( - shape=[None, None], name="position_ids", dtype="int64"), + dtype="int64", + ), + paddle.static.InputSpec(shape=[None, None], name="position_ids", dtype="int64"), ] def _construct_processor(self, model): @@ -79,8 +72,7 @@ def _construct_processor(self, model): Construct the tokenizer for the predictor. """ # bulid processor - self._processor = GroudingDinoProcessor.from_pretrained( - model, cache_dir=self._model_dir) + self._processor = GroudingDinoProcessor.from_pretrained(model, cache_dir=self._model_dir) def _construct_model(self, model): """ @@ -88,8 +80,7 @@ def _construct_model(self, model): """ # bulid model - model_instance = GroundingDinoModel.from_pretrained( - model, cache_dir=self._model_dir) + model_instance = GroundingDinoModel.from_pretrained(model, cache_dir=self._model_dir) # Load the model parameter for the predict model_instance.eval() @@ -98,13 +89,12 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" self._size = image.size - image_tensor, mask, tokenized_out = self._processor( - images=image, text=prompt) + image_tensor, mask, tokenized_out = self._processor(images=image, text=prompt) inputs["image_tensor"] = image_tensor inputs["mask"] = mask @@ -121,16 +111,18 @@ def _run_model(self, inputs): inputs["mask"] = paddle.cast(inputs["mask"], dtype="int64") inputs["tokenized_out"]["text_self_attention_masks"] = paddle.cast( - inputs["tokenized_out"]["text_self_attention_masks"], - dtype="int64") - [pred_boxes, pred_logits] = self.predictor.run([ - inputs["image_tensor"], - inputs["mask"], - inputs["tokenized_out"]["input_ids"], - inputs["tokenized_out"]["attention_mask"], - inputs["tokenized_out"]["text_self_attention_masks"], - inputs["tokenized_out"]["position_ids"], - ]) + inputs["tokenized_out"]["text_self_attention_masks"], dtype="int64" + ) + [pred_boxes, pred_logits] = self.predictor.run( + [ + inputs["image_tensor"], + inputs["mask"], + inputs["tokenized_out"]["input_ids"], + inputs["tokenized_out"]["attention_mask"], + inputs["tokenized_out"]["text_self_attention_masks"], + inputs["tokenized_out"]["position_ids"], + ] + ) result = {"pred_logits": pred_logits, "pred_boxes": pred_boxes} else: result = self._model( @@ -138,9 +130,9 @@ def _run_model(self, inputs): inputs["mask"], input_ids=inputs["tokenized_out"]["input_ids"], attention_mask=inputs["tokenized_out"]["attention_mask"], - 
text_self_attention_masks=inputs["tokenized_out"][ - "text_self_attention_masks"], - position_ids=inputs["tokenized_out"]["position_ids"], ) + text_self_attention_masks=inputs["tokenized_out"]["text_self_attention_masks"], + position_ids=inputs["tokenized_out"]["position_ids"], + ) inputs.pop("image_tensor", None) inputs.pop("mask", None) inputs.pop("tokenized_out", None) @@ -155,10 +147,8 @@ def _postprocess(self, inputs): """ if self._static_mode: - inputs["result"]["pred_logits"] = paddle.to_tensor(inputs["result"][ - "pred_logits"]) - inputs["result"]["pred_boxes"] = paddle.to_tensor(inputs["result"][ - "pred_boxes"]) + inputs["result"]["pred_logits"] = paddle.to_tensor(inputs["result"]["pred_logits"]) + inputs["result"]["pred_boxes"] = paddle.to_tensor(inputs["result"]["pred_boxes"]) logits = F.sigmoid(inputs["result"]["pred_logits"])[0] # (nq, 256) boxes = inputs["result"]["pred_boxes"][0] # (nq, 4) @@ -174,8 +164,7 @@ def _postprocess(self, inputs): pred_phrases = [] for logit, box in zip(logits_filt, boxes_filt): pred_phrase = self._processor.decode(logit > self._text_threshold) - pred_phrases.append(pred_phrase + - f"({str(logit.max().item())[:4]})") + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") H, W = self._size[1], self._size[0] boxes = [] @@ -239,10 +228,8 @@ def _construct_input_spec(self): shape2 = [64, 1, 2] self._input_spec = [ - paddle.static.InputSpec( - shape=shape, dtype="float32"), - paddle.static.InputSpec( - shape=shape2, dtype="int32"), + paddle.static.InputSpec(shape=shape, dtype="float32"), + paddle.static.InputSpec(shape=shape2, dtype="int32"), ] def _construct_processor(self, model): @@ -250,8 +237,7 @@ def _construct_processor(self, model): Construct the tokenizer for the predictor. """ # bulid processor - self._processor = SamProcessor.from_pretrained( - model, cache_dir=self._model_dir) + self._processor = SamProcessor.from_pretrained(model, cache_dir=self._model_dir) def _construct_model(self, model): """ @@ -259,8 +245,7 @@ def _construct_model(self, model): """ # bulid model - model_instance = SamModel.from_pretrained( - model, input_type=self._input_type, cache_dir=self._model_dir) + model_instance = SamModel.from_pretrained(model, input_type=self._input_type, cache_dir=self._model_dir) # Load the model parameter for the predict model_instance.eval() @@ -269,15 +254,13 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" box_prompt = inputs.get("boxes", None) points_prompt = inputs.get("points", None) - assert (box_prompt is not None or - points_prompt is not None), f"The prompt is None" + assert box_prompt is not None or points_prompt is not None, "The prompt is None" if box_prompt is not None: - box_prompt = (box_prompt if isinstance(box_prompt, np.ndarray) else - np.array(box_prompt)) + box_prompt = box_prompt if isinstance(box_prompt, np.ndarray) else np.array(box_prompt) if points_prompt is not None: points_prompt = np.array([points_prompt]) @@ -285,7 +268,8 @@ def _preprocess(self, inputs): image, input_type=self._input_type, box=box_prompt, - point_coords=points_prompt, ) + point_coords=points_prompt, + ) inputs["image_seg"] = image_seg inputs["prompt"] = prompt @@ -306,8 +290,7 @@ def _run_model(self, inputs): result = result[0] else: - result = self._model( - img=inputs["image_seg"], prompt=inputs["prompt"]) + result = self._model(img=inputs["image_seg"], 
prompt=inputs["prompt"]) inputs.pop("image_seg", None) diff --git a/paddlemix/appflow/text2image_generation.py b/paddlemix/appflow/text2image_generation.py index bebe51fdaf1ef..2b30c0b831245 100644 --- a/paddlemix/appflow/text2image_generation.py +++ b/paddlemix/appflow/text2image_generation.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle - -from ppdiffusers import (StableDiffusionPipeline, - VersatileDiffusionDualGuidedPipeline) +from ppdiffusers import StableDiffusionPipeline, VersatileDiffusionDualGuidedPipeline from .apptask import AppTask @@ -45,7 +42,7 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" return inputs @@ -58,7 +55,8 @@ def _run_model(self, inputs): prompt=inputs["prompt"], guidance_scale=self._guidance_scale, height=self._height, - width=self._width, ).images[0] + width=self._width, + ).images[0] inputs.pop("prompt", None) @@ -77,8 +75,7 @@ def _postprocess(self, inputs): class VersatileDiffusionDualGuidedTask(AppTask): def __init__(self, task, model, **kwargs): super().__init__(task=task, model=model, **kwargs) - self._text_to_image_strength = kwargs.get("text_to_image_strength", - 0.75) + self._text_to_image_strength = kwargs.get("text_to_image_strength", 0.75) # Default to static mode self._static_mode = False self._construct_model(model) @@ -89,17 +86,16 @@ def _construct_model(self, model): """ # bulid model - model_instance = VersatileDiffusionDualGuidedPipeline.from_pretrained( - model) + model_instance = VersatileDiffusionDualGuidedPipeline.from_pretrained(model) model_instance.remove_unused_weights() self._model = model_instance def _preprocess(self, inputs): """ """ prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" return inputs @@ -111,7 +107,8 @@ def _run_model(self, inputs): result = self._model( prompt=inputs["prompt"], image=inputs["image"], - text_to_image_strength=self._text_to_image_strength, ).images[0] + text_to_image_strength=self._text_to_image_strength, + ).images[0] inputs.pop("prompt", None) inputs.pop("image", None) diff --git a/paddlemix/appflow/text2image_inpaiting.py b/paddlemix/appflow/text2image_inpaiting.py index 4f363791397cf..311dc5b8acf24 100644 --- a/paddlemix/appflow/text2image_inpaiting.py +++ b/paddlemix/appflow/text2image_inpaiting.py @@ -15,7 +15,6 @@ import paddle from PIL import Image -from paddlemix.utils.log import logger from ppdiffusers import StableDiffusionInpaintPipeline from .apptask import AppTask @@ -45,11 +44,11 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" seg_masks = inputs.get("seg_masks", None) - assert seg_masks is not None, f"The seg masks is None" + assert seg_masks is not None, "The seg masks is None" inpaint_prompt = inputs.get("inpaint_prompt", None) - assert inpaint_prompt is not None, f"The inpaint_prompt is None" + assert inpaint_prompt is not None, "The inpaint_prompt is None" self._org_size = image.size merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) @@ -72,7 +71,8 @@ 
def _run_model(self, inputs): result = self._model( inputs["inpaint_prompt"], image=inputs["image"], - mask_image=inputs["mask_pil"], ).images[0] + mask_image=inputs["mask_pil"], + ).images[0] inputs.pop("mask_pil", None) inputs.pop("image", None) diff --git a/paddlemix/appflow/text2text_generation.py b/paddlemix/appflow/text2text_generation.py index 0a29e36b71049..9ceb11cbefdb8 100644 --- a/paddlemix/appflow/text2text_generation.py +++ b/paddlemix/appflow/text2text_generation.py @@ -14,8 +14,6 @@ from paddlenlp import Taskflow -from paddlemix.utils.log import logger - from .apptask import AppTask @@ -41,15 +39,16 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" prompt = ( "Given caption,extract the main object to be replaced and marked it as 'main_object', " - + f"Extract the remaining part as 'other prompt', " + - f"Return main_object, other prompt in English" + - f"Given caption: {prompt}.") + + "Extract the remaining part as 'other prompt', " + + "Return main_object, other prompt in English" + + "Given caption: {}.".format(prompt) + ) inputs["prompt"] = prompt @@ -74,7 +73,8 @@ def _postprocess(self, inputs): prompt, inpaint_prompt = ( inputs["result"].split("\n")[0].split(":")[-1].strip(), - inputs["result"].split("\n")[-1].split(":")[-1].strip(), ) + inputs["result"].split("\n")[-1].split(":")[-1].strip(), + ) inputs.pop("result", None) diff --git a/paddlemix/appflow/text2video_generation.py b/paddlemix/appflow/text2video_generation.py index 77b374eaabecb..290917706c77a 100644 --- a/paddlemix/appflow/text2video_generation.py +++ b/paddlemix/appflow/text2video_generation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline @@ -34,14 +33,13 @@ def _construct_model(self, model): # bulid model model_instance = TextToVideoSDPipeline.from_pretrained(model) - model_instance.scheduler = DPMSolverMultistepScheduler.from_config( - model_instance.scheduler.config) + model_instance.scheduler = DPMSolverMultistepScheduler.from_config(model_instance.scheduler.config) self._model = model_instance def _preprocess(self, inputs): """ """ prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" num_inference_steps = inputs.get("num_inference_steps", 25) inputs["num_inference_steps"] = num_inference_steps @@ -54,7 +52,8 @@ def _run_model(self, inputs): result = self._model( prompt=inputs["prompt"], - num_inference_steps=inputs["num_inference_steps"], ).frames + num_inference_steps=inputs["num_inference_steps"], + ).frames inputs.pop("prompt", None) diff --git a/paddlemix/checkpoint.py b/paddlemix/checkpoint.py index 3d885eab2494a..81d44482730d2 100644 --- a/paddlemix/checkpoint.py +++ b/paddlemix/checkpoint.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import shutil import paddle @@ -25,21 +26,17 @@ def save(args, model, optimizer, epoch=0, step=0, output_dir="", is_best=False): return if output_dir and isinstance(output_dir, str): - output_dir = os.path.join(output_dir, - "epoch_%d_step_%d" % (epoch, step)) + output_dir = os.path.join(output_dir, "epoch_%d_step_%d" % (epoch, step)) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) print("Save model to %s" % output_dir) - save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format( - output_dir, args.mp_rank, args.sharding_rank) + save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format(output_dir, args.mp_rank, args.sharding_rank) # if args.sharding_stage == 3: # model.get_all_parameters(convert2cpu=False) - paddle.save(model.state_dict(), - os.path.join(save_dir, "model.pdparams")) - paddle.save(optimizer.state_dict(), - os.path.join(save_dir, "model_state.pdopt")) + paddle.save(model.state_dict(), os.path.join(save_dir, "model.pdparams")) + paddle.save(optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt")) if is_best: shutil.copyfile("model.pdparams", "model_best.pdparams") meta_dict = { @@ -60,19 +57,15 @@ def load_model(args, model, optimizer=None, ckpt_dir=""): if ckpt_dir and isinstance(ckpt_dir, str) and os.path.isdir(ckpt_dir): print("Try to load checkpoint from %s " % ckpt_dir) - load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format( - ckpt_dir, args.mp_rank, args.sharding_rank) + load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format(ckpt_dir, args.mp_rank, args.sharding_rank) model_path = os.path.join(load_dir, "model.pdparams") opt_path = os.path.join(load_dir, "model_state.pdopt") - meta_path = os.path.join(load_dir, "meta_state.pdopt") + # meta_path = os.path.join(load_dir, "meta_state.pdopt") if os.path.exists(model_path): model_dict = paddle.load(model_path) for name, param in model.state_dict().items(): - assert ( - name in model_dict.keys() - ), "No param named `{}` was found in checkpoint file.".format( - name) + assert name in model_dict.keys(), "No param named `{}` was found in checkpoint file.".format(name) if param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) @@ -125,9 +118,7 @@ def load_model(args, model, optimizer=None, ckpt_dir=""): ] rowlinear_list = [] all_list = collinear_list + rowlinear_list + embedding_list - skip_list = [ - "visual.patch_embed.proj.weight", "visual.patch_embed.proj.bias" - ] + skip_list = ["visual.patch_embed.proj.weight", "visual.patch_embed.proj.bias"] col_list = [] row_list = [] @@ -148,22 +139,21 @@ def renamebias(model_dict, whole_key): def col_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[1] // mp_size - return model_dict[:, mp_rank * subbatch:(mp_rank + 1) * - subbatch] + return model_dict[:, mp_rank * subbatch : (mp_rank + 1) * subbatch] elif len(model_dict.shape) == 1: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] def row_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] else: return model_dict def emb_split_modeldict(model_dict): subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] model_dict = 
paddle.load(ckpt_dir) modelkeys = list(model_dict.keys()) @@ -180,28 +170,22 @@ def emb_split_modeldict(model_dict): if key in all_list: if key in collinear_list: col_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = col_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = col_split_modeldict(model_dict[whole_key]) elif key in rowlinear_list: row_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = row_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = row_split_modeldict(model_dict[whole_key]) else: emb_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = emb_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = emb_split_modeldict(model_dict[whole_key]) if args.context_length != 77: - model_dict["text.positional_embedding"] = model_dict[ - "text.positional_embedding"][:args.context_length, :] + model_dict["text.positional_embedding"] = model_dict["text.positional_embedding"][: args.context_length, :] - print("cast state_dict to default dtype:{}".format( - paddle.get_default_dtype())) + print("cast state_dict to default dtype:{}".format(paddle.get_default_dtype())) for key, value in model_dict.items(): if "freqs_cos" in key or "freqs_sin" in key: continue - model_dict[key] = paddle.cast( - value, dtype=paddle.get_default_dtype()) + model_dict[key] = paddle.cast(value, dtype=paddle.get_default_dtype()) model.set_state_dict(model_dict) del model_dict else: diff --git a/paddlemix/datasets/caption_dataset.py b/paddlemix/datasets/caption_dataset.py index 54ab650f3ddfe..3bcff989fe2bf 100644 --- a/paddlemix/datasets/caption_dataset.py +++ b/paddlemix/datasets/caption_dataset.py @@ -33,25 +33,27 @@ class CaptionDataset(DatasetBuilder): """ URL = "https://bj.bcebos.com/paddlemix/datasets/coco.tar.gz" - META_INFO = collections.namedtuple( - "META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) + META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) MD5 = "" SPLITS = { "train": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_train.json"), "", - "aa31ac474cf6250ebb81d18348a07ed8", ), + "aa31ac474cf6250ebb81d18348a07ed8", + ), "val": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_val.json"), "", - "b273847456ef5580e33713b1f7de52a0", ), + "b273847456ef5580e33713b1f7de52a0", + ), "test": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_test.json"), "", - "3ff34b0ef2db02d01c37399f6a2a6cd1", ), + "3ff34b0ef2db02d01c37399f6a2a6cd1", + ), } def _get_data(self, mode, **kwargs): @@ -108,7 +110,6 @@ def _read(self, filename, *args): else: yield_data = { "image": image_path, - "image_id": ann["image"].split("/")[-1].strip(".jpg") - .split("_")[-1], + "image_id": ann["image"].split("/")[-1].strip(".jpg").split("_")[-1], } yield yield_data diff --git a/paddlemix/datasets/coco_clip.py b/paddlemix/datasets/coco_clip.py index 17166b44859d9..7dcd6e6c26661 100644 --- a/paddlemix/datasets/coco_clip.py +++ b/paddlemix/datasets/coco_clip.py @@ -26,25 +26,27 @@ class CaptionCLIP(DatasetBuilder): URL = "https://bj.bcebos.com/paddlemix/datasets/coco.tar.gz" - META_INFO = collections.namedtuple( - "META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) + META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) MD5 = "" SPLITS = { "train": META_INFO( 
os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_train.json"), "", - "aa31ac474cf6250ebb81d18348a07ed8", ), + "aa31ac474cf6250ebb81d18348a07ed8", + ), "val": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_val.json"), "", - "b273847456ef5580e33713b1f7de52a0", ), + "b273847456ef5580e33713b1f7de52a0", + ), "test": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_test.json"), "", - "3ff34b0ef2db02d01c37399f6a2a6cd1", ), + "3ff34b0ef2db02d01c37399f6a2a6cd1", + ), } def _get_data(self, mode, **kwargs): @@ -74,7 +76,6 @@ def _gen_image_id(self, anno): def _read(self, filename, *args): image_root, anno_path, mode = filename annotations = json.load(open(anno_path, "r")) - image_ids = self._gen_image_id(annotations) for ann in annotations: image_path = os.path.join(image_root, ann["image"]) diff --git a/paddlemix/datasets/dataset.py b/paddlemix/datasets/dataset.py index 96452fb68de78..047bbfd796e57 100644 --- a/paddlemix/datasets/dataset.py +++ b/paddlemix/datasets/dataset.py @@ -64,7 +64,7 @@ class DatasetTuple: def __init__(self, splits): self.identifier_map, identifiers = self._gen_identifier_map(splits) self.tuple_cls = namedtuple("datasets", identifiers) - self.tuple = self.tuple_cls(* [None for _ in splits]) + self.tuple = self.tuple_cls(*[None for _ in splits]) def __getitem__(self, key): if isinstance(key, (int, slice)): @@ -116,8 +116,7 @@ def load_from_hf(path, name=None, splits=None, **kwargs): try: hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) except FileNotFoundError: - raise FileNotFoundError("Couldn't find the dataset script for '" + path - + "' on PaddleNLP or HuggingFace") + raise FileNotFoundError("Couldn't find the dataset script for '" + path + "' on PaddleNLP or HuggingFace") else: label_list = [] if isinstance(hf_datasets, DatasetDict): @@ -133,8 +132,7 @@ def load_from_hf(path, name=None, splits=None, **kwargs): for feature in hf_datasets[i].features.values(): if isinstance(feature, ClassLabel): label_list = feature.names - datasets[split] = MapDataset( - hf_datasets[i], label_list=label_list) + datasets[split] = MapDataset(hf_datasets[i], label_list=label_list) else: for feature in hf_datasets.features.values(): if isinstance(feature, ClassLabel): @@ -143,12 +141,7 @@ def load_from_hf(path, name=None, splits=None, **kwargs): return datasets -def load_dataset(path_or_read_func, - name=None, - data_files=None, - splits=None, - lazy=None, - **kwargs): +def load_dataset(path_or_read_func, name=None, data_files=None, splits=None, lazy=None, **kwargs): """ This method will load a dataset, either form PaddleNLP library or from a self-defined data loading script, by calling functions in `DatasetBuilder`. @@ -197,26 +190,22 @@ def load_dataset(path_or_read_func, try: reader_cls = import_main_class(path_or_read_func) except ModuleNotFoundError: - datasets = load_from_hf( - path_or_read_func, name=name, splits=splits, **kwargs) + datasets = load_from_hf(path_or_read_func, name=name, splits=splits, **kwargs) else: reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) # Check if selected name and split is valid in this DatasetBuilder if hasattr(reader_instance, "BUILDER_CONFIGS"): if name in reader_cls.BUILDER_CONFIGS.keys(): - split_names = reader_cls.BUILDER_CONFIGS[name][ - "splits"].keys() + split_names = reader_cls.BUILDER_CONFIGS[name]["splits"].keys() else: raise ValueError( - 'Invalid name "{}". 
Should be one of {}.'.format( - name, list(reader_cls.BUILDER_CONFIGS.keys()))) + 'Invalid name "{}". Should be one of {}.'.format(name, list(reader_cls.BUILDER_CONFIGS.keys())) + ) elif hasattr(reader_instance, "SPLITS"): split_names = reader_instance.SPLITS.keys() else: - raise AttributeError( - "Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder." - ) + raise AttributeError("Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder.") selected_splits = [] if isinstance(splits, list) or isinstance(splits, tuple): @@ -226,11 +215,9 @@ def load_dataset(path_or_read_func, for split_name in selected_splits: if split_name not in split_names and split_name is not None: - raise ValueError('Invalid split "{}". Should be one of {}.'. - format(split_name, list(split_names))) + raise ValueError('Invalid split "{}". Should be one of {}.'.format(split_name, list(split_names))) - datasets = reader_instance.read_datasets( - data_files=data_files, splits=splits) + datasets = reader_instance.read_datasets(data_files=data_files, splits=splits) return datasets @@ -268,8 +255,7 @@ def __getitem__(self, idx): Basic function of `MapDataset` to get sample from dataset with a given index. """ - return (self._transform(self.new_data[idx]) - if self._transform_pipline else self.new_data[idx]) + return self._transform(self.new_data[idx]) if self._transform_pipline else self.new_data[idx] def __len__(self): """ @@ -291,21 +277,12 @@ def filter(self, fn, num_workers=0): assert num_workers >= 0, "num_workers should be a non-negative value" if num_workers > 1: shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) - ] - kwds_per_shard = [ - dict( - self=shards[rank], fn=fn) for rank in range(num_workers) + self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers) ] - pool = Pool(num_workers, initargs=(RLock(), )) + kwds_per_shard = [dict(self=shards[rank], fn=fn) for rank in range(num_workers)] + pool = Pool(num_workers, initargs=(RLock(),)) - results = [ - pool.apply_async( - self.__class__._filter, kwds=kwds) - for kwds in kwds_per_shard - ] + results = [pool.apply_async(self.__class__._filter, kwds=kwds) for kwds in kwds_per_shard] transformed_shards = [r.get() for r in results] pool.close() @@ -318,15 +295,11 @@ def filter(self, fn, num_workers=0): return self._filter(fn) def _filter(self, fn): - self.new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if fn(self.new_data[idx]) - ] + self.new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if fn(self.new_data[idx])] return self def shard(self, num_shards=None, index=None, contiguous=False): - self.new_data = self._shard( - num_shards=num_shards, index=index, contiguous=contiguous).data + self.new_data = self._shard(num_shards=num_shards, index=index, contiguous=contiguous).data return self def _shard(self, num_shards=None, index=None, contiguous=False): @@ -359,10 +332,7 @@ def _shard(self, num_shards=None, index=None, contiguous=False): end = start + div + (1 if index < mod else 0) new_data = [self.new_data[idx] for idx in range(start, end)] else: - new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if idx % num_shards == index - ] + new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if idx % num_shards == index] return MapDataset(new_data) @@ -388,20 +358,13 @@ def map(self, fn, lazy=True, batched=False, num_workers=0): assert num_workers >= 0, 
"num_workers should be a non-negative value" if num_workers > 1: shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) + self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers) ] kwds_per_shard = [ - dict( - self=shards[rank], fn=fn, lazy=False, batched=batched) - for rank in range(num_workers) - ] - pool = Pool(num_workers, initargs=(RLock(), )) - results = [ - pool.apply_async( - self.__class__._map, kwds=kwds) for kwds in kwds_per_shard + dict(self=shards[rank], fn=fn, lazy=False, batched=batched) for rank in range(num_workers) ] + pool = Pool(num_workers, initargs=(RLock(),)) + results = [pool.apply_async(self.__class__._map, kwds=kwds) for kwds in kwds_per_shard] transformed_shards = [r.get() for r in results] pool.close() pool.join() @@ -418,9 +381,7 @@ def _map(self, fn, lazy=True, batched=False): elif lazy: self._transform_pipline.append(fn) else: - self.new_data = [ - fn(self.new_data[idx]) for idx in range(len(self.new_data)) - ] + self.new_data = [fn(self.new_data[idx]) for idx in range(len(self.new_data))] return self @@ -468,23 +429,19 @@ def __iter__(self): num_samples = 0 if inspect.isfunction(self.data): for example in self.data(): - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example + if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter( + num_samples=num_samples + ): + yield self._transform(example) if self._transform_pipline else example num_samples += 1 else: if inspect.isgenerator(self.data): - warnings.warn( - "Reciving generator as data source, data can only be iterated once" - ) + warnings.warn("Reciving generator as data source, data can only be iterated once") for example in self.data: - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example + if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter( + num_samples=num_samples + ): + yield self._transform(example) if self._transform_pipline else example num_samples += 1 def filter(self, fn): @@ -578,22 +535,23 @@ def remove_if_exit(filepath): if data_files is None: if splits is None: - splits = (list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) - if hasattr(self, "BUILDER_CONFIGS") else - list(self.SPLITS.keys())) + splits = ( + list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) + if hasattr(self, "BUILDER_CONFIGS") + else list(self.SPLITS.keys()) + ) assert ( - isinstance(splits, str) or - (isinstance(splits, list) and isinstance(splits[0], str)) or - (isinstance(splits, tuple) and isinstance(splits[0], str)) + isinstance(splits, str) + or (isinstance(splits, list) and isinstance(splits[0], str)) + or (isinstance(splits, tuple) and isinstance(splits[0], str)) ), "`splits` should be a string or list of string or a tuple of string." 
if isinstance(splits, str): splits = [splits] datasets = DatasetTuple(splits) parallel_env = dist.ParallelEnv() - unique_endpoints = _get_unique_endpoints( - parallel_env.trainer_endpoints[:]) + unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:]) # move register hook to first and register togather lock_files = [] for split in splits: @@ -625,8 +583,7 @@ def remove_if_exit(filepath): datasets[split] = self.read(filename=filename, split=split) else: assert ( - isinstance(data_files, str) or isinstance(data_files, tuple) or - isinstance(data_files, list) + isinstance(data_files, str) or isinstance(data_files, tuple) or isinstance(data_files, list) ), "`data_files` should be a string or tuple or list of strings." if isinstance(data_files, str): data_files = [data_files] @@ -639,14 +596,11 @@ def remove_if_exit(filepath): data_files ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." for i in range(len(data_files)): - datasets[splits[i]] = self.read( - filename=data_files[i], split=splits[i]) + datasets[splits[i]] = self.read(filename=data_files[i], split=splits[i]) else: - datasets = DatasetTuple( - ["split" + str(i) for i in range(len(data_files))]) + datasets = DatasetTuple(["split" + str(i) for i in range(len(data_files))]) for i in range(len(data_files)): - datasets["split" + str(i)] = self.read( - filename=data_files[i], split=default_split) + datasets["split" + str(i)] = self.read(filename=data_files[i], split=default_split) return datasets if len(datasets) > 1 else datasets[0] @@ -701,9 +655,9 @@ def _convert_label_to_id(labels, label_dict): if self.lazy: def generate_examples(): - generator = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) + generator = ( + self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename) + ) for example in generator: # We need to check if the example contains label column and confirm its name. # For now we only allow `label` or `labels` to be the name of label column. @@ -720,24 +674,17 @@ def generate_examples(): # For multiple labels in the form of list. if isinstance(label_dict, list): for idx, sub_dict in enumerate(label_dict): - example[label_col][idx] = _convert_label_to_id( - example[label_col][idx], sub_dict) + example[label_col][idx] = _convert_label_to_id(example[label_col][idx], sub_dict) else: - example[label_col] = _convert_label_to_id( - example[label_col], label_dict) + example[label_col] = _convert_label_to_id(example[label_col], label_dict) yield example else: yield example - return IterDataset( - generate_examples(), - label_list=label_list, - vocab_info=vocab_info) + return IterDataset(generate_examples(), label_list=label_list, vocab_info=vocab_info) else: - examples = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) + examples = self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename) # Then some validation. if not isinstance(examples, list): @@ -745,8 +692,8 @@ def generate_examples(): if not examples: raise ValueError( - "No instances were read from the given filepath {}. " - "Is the path correct?".format(filename)) + "No instances were read from the given filepath {}. " "Is the path correct?".format(filename) + ) # We need to check if the example contains label column and confirm its name. # For now we only allow `label` or `labels` to be the name of label column. 
@@ -764,14 +711,11 @@ def generate_examples(): # For multiple labels in the form of list. if isinstance(label_dict, list): for i, sub_dict in enumerate(label_dict): - examples[idx][label_col][i] = _convert_label_to_id( - examples[idx][label_col][i], sub_dict) + examples[idx][label_col][i] = _convert_label_to_id(examples[idx][label_col][i], sub_dict) else: - examples[idx][label_col] = _convert_label_to_id( - examples[idx][label_col], label_dict) + examples[idx][label_col] = _convert_label_to_id(examples[idx][label_col], label_dict) - return MapDataset( - examples, label_list=label_list, vocab_info=vocab_info) + return MapDataset(examples, label_list=label_list, vocab_info=vocab_info) def _read(self, filename: str, *args): """ @@ -820,15 +764,13 @@ def generate_examples(): return IterDataset(generate_examples) else: examples = self._read(**kwargs) - if hasattr(examples, "__len__") and hasattr(examples, - "__getitem__"): + if hasattr(examples, "__len__") and hasattr(examples, "__getitem__"): return MapDataset(examples) else: return MapDataset(list(examples)) -def has_file_allowed_extension(filename: str, - extensions: Union[str, Tuple[str, ...]]) -> bool: +def has_file_allowed_extension(filename: str, extensions: Union[str, Tuple[str, ...]]) -> bool: """Checks if a file is an allowed extension. Args: @@ -838,8 +780,7 @@ def has_file_allowed_extension(filename: str, Returns: bool: True if the filename ends with one of given extensions """ - return filename.lower().endswith( - extensions if isinstance(extensions, str) else tuple(extensions)) + return filename.lower().endswith(extensions if isinstance(extensions, str) else tuple(extensions)) def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: @@ -847,22 +788,20 @@ def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: See :class:`DatasetFolder` for details. """ - classes = sorted( - entry.name for entry in os.scandir(directory) if entry.is_dir()) + classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir()) if not classes: - raise FileNotFoundError( - f"Couldn't find any class folder in {directory}.") + raise FileNotFoundError(f"Couldn't find any class folder in {directory}.") class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} return classes, class_to_idx def make_dataset( - directory: str, - class_to_idx: Optional[Dict[str, int]]=None, - extensions: Optional[Union[str, Tuple[str, ...]]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[Tuple[ - str, int]]: + directory: str, + class_to_idx: Optional[Dict[str, int]] = None, + extensions: Optional[Union[str, Tuple[str, ...]]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, +) -> List[Tuple[str, int]]: """Generates a list of samples of a form (path_to_sample, class). See :class:`DatasetFolder` for details. @@ -875,22 +814,17 @@ def make_dataset( if class_to_idx is None: _, class_to_idx = find_classes(directory) elif not class_to_idx: - raise ValueError( - "'class_to_index' must have at least one entry to collect any samples." 
- ) + raise ValueError("'class_to_index' must have at least one entry to collect any samples.") both_none = extensions is None and is_valid_file is None both_something = extensions is not None and is_valid_file is not None if both_none or both_something: - raise ValueError( - "Both extensions and is_valid_file cannot be None or not None at the same time" - ) + raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time") if extensions is not None: def is_valid_file(x: str) -> bool: - return has_file_allowed_extension( - x, extensions) # type: ignore[arg-type] + return has_file_allowed_extension(x, extensions) # type: ignore[arg-type] is_valid_file = cast(Callable[[str], bool], is_valid_file) @@ -913,9 +847,7 @@ def is_valid_file(x: str) -> bool: empty_classes = set(class_to_idx.keys()) - available_classes if empty_classes: - msg = ( - f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " - ) + msg = f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " if extensions is not None: msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}" raise FileNotFoundError(msg) @@ -951,13 +883,14 @@ class DatasetFolder(Dataset): """ def __init__( - self, - root: str, - loader: Callable[[str], Any], - extensions: Optional[Tuple[str, ...]]=None, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> None: + self, + root: str, + loader: Callable[[str], Any], + extensions: Optional[Tuple[str, ...]] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> None: # super().__init__(root, transform=transform, target_transform=target_transform) # super().__init__() self.root = root @@ -965,8 +898,7 @@ def __init__( self.target_transform = target_transform classes, class_to_idx = self.find_classes(self.root) - samples = self.make_dataset(self.root, class_to_idx, extensions, - is_valid_file) + samples = self.make_dataset(self.root, class_to_idx, extensions, is_valid_file) self.loader = loader self.extensions = extensions @@ -978,11 +910,11 @@ def __init__( @staticmethod def make_dataset( - directory: str, - class_to_idx: Dict[str, int], - extensions: Optional[Tuple[str, ...]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[ - Tuple[str, int]]: + directory: str, + class_to_idx: Dict[str, int], + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> List[Tuple[str, int]]: """Generates a list of samples of a form (path_to_sample, class). This can be overridden to e.g. read files from a compressed zip file instead of from the disk. @@ -1010,11 +942,7 @@ def make_dataset( # find_classes() function, instead of using that of the find_classes() method, which # is potentially overridden and thus could have a different logic. 
raise ValueError("The class_to_idx parameter cannot be None.") - return make_dataset( - directory, - class_to_idx, - extensions=extensions, - is_valid_file=is_valid_file) + return make_dataset(directory, class_to_idx, extensions=extensions, is_valid_file=is_valid_file) def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]: """Find the class folders in a dataset structured as follows:: @@ -1075,7 +1003,8 @@ def __len__(self) -> int: ".pgm", ".tif", ".tiff", - ".webp", ) + ".webp", +) def pil_loader(path: str) -> Image.Image: @@ -1120,17 +1049,19 @@ class ImageFolder(DatasetFolder): """ def __init__( - self, - root: str, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - loader: Callable[[str], Any]=default_loader, - is_valid_file: Optional[Callable[[str], bool]]=None, ): + self, + root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + is_valid_file: Optional[Callable[[str], bool]] = None, + ): super().__init__( root, loader, IMG_EXTENSIONS if is_valid_file is None else None, transform=transform, target_transform=target_transform, - is_valid_file=is_valid_file, ) + is_valid_file=is_valid_file, + ) self.imgs = self.samples diff --git a/paddlemix/datasets/laion_clip.py b/paddlemix/datasets/laion_clip.py index 24edc6a38e340..4d4fa6c5a104f 100644 --- a/paddlemix/datasets/laion_clip.py +++ b/paddlemix/datasets/laion_clip.py @@ -11,19 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import base64 -import gzip -import io + import logging import os -import random -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast -import paddle import paddle.vision.datasets as datasets from easydict import EasyDict as edict -from paddle.io import DataLoader, Dataset, IterableDataset, get_worker_info -from PIL import Image +from paddle.io import DataLoader from .dataset import ImageFolder @@ -39,8 +33,7 @@ def get_classification(args, preprocess_fns): for data_path in data_paths: data_path = data_path.rstrip("/") logging.info(f"adding classification dataset: {data_path}") - dataset = datasets.ImageFolder( - f"{data_path}/images", transform=preprocess_fn) + dataset = datasets.ImageFolder(f"{data_path}/images", transform=preprocess_fn) dataset = ImageFolder(f"{data_path}/images", transform=preprocess_fn) @@ -48,7 +41,8 @@ def get_classification(args, preprocess_fns): dataset, batch_size=args.per_device_eval_batch_size, # hard code num_workers=args.dataloader_num_workers, - shuffle=False, ) + shuffle=False, + ) classname_filename = f"{data_path}/labels.txt" template_filename = f"{data_path}/templates.txt" @@ -56,7 +50,8 @@ def get_classification(args, preprocess_fns): result[f"{os.path.basename(data_path)}"] = edict( dataloader=dataloader, classname_filename=classname_filename, - template_filename=template_filename, ) + template_filename=template_filename, + ) return result diff --git a/paddlemix/examples/Sam/run_predict.py b/paddlemix/examples/Sam/run_predict.py index 46591f2abd0cd..cbfe59f068785 100644 --- a/paddlemix/examples/Sam/run_predict.py +++ b/paddlemix/examples/Sam/run_predict.py @@ -18,11 +18,9 @@ import matplotlib.pyplot as plt import numpy as np -import paddle -import paddle.nn.functional as F import requests from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, 
ImageFont +from PIL import Image from paddlemix.models.sam.modeling import SamModel from paddlemix.processors.sam_processing import SamProcessor @@ -56,11 +54,8 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - box_prompt: List[int] = field( - default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) - points_prompt: List[int] = field( - default=None, - metadata={"help": "point promt format as [[xy],[xy]...]."}) + box_prompt: List[int] = field(default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) + points_prompt: List[int] = field(default=None, metadata={"help": "point promt format as [[xy],[xy]...]."}) @dataclass @@ -71,19 +66,20 @@ class ModelArguments: model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) input_type: str = field( default="boxs", - metadata={ - "help": - "The model prompt type, choices ['boxs', 'points', 'points_grid']." - }, ) + metadata={"help": "The model prompt type, choices ['boxs', 'points', 'points_grid']."}, + ) output_dir: str = field( default="seg_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -94,15 +90,13 @@ def main(): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") # bulid processor processor = SamProcessor.from_pretrained(model_args.model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.model_name_or_path, input_type=model_args.input_type) + sam_model = SamModel.from_pretrained(model_args.model_name_or_path, input_type=model_args.input_type) if data_args.box_prompt is not None: data_args.box_prompt = np.array(data_args.box_prompt) @@ -113,7 +107,8 @@ def main(): image_pil, input_type=model_args.input_type, box=data_args.box_prompt, - point_coords=data_args.points_prompt, ) + point_coords=data_args.points_prompt, + ) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = processor.postprocess_masks(seg_masks) @@ -131,7 +126,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) if __name__ == "__main__": diff --git a/paddlemix/examples/blip2/export.py b/paddlemix/examples/blip2/export.py index 745b2f3f5ef7f..d206f4f47a749 100644 --- a/paddlemix/examples/blip2/export.py +++ b/paddlemix/examples/blip2/export.py @@ -11,24 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) from dataclasses import dataclass, field + import paddle -import requests +import yaml from paddlenlp.trainer import PdArgumentParser -from PIL import Image + from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor from paddlemix.utils.log import logger -import os -import yaml -import paddle -import argparse -import os -import paddle @dataclass @@ -41,13 +36,11 @@ class DataArguments: """ input_image: str = field( - default="http://images.cocodataset.org/val2017/000000039769.jpg", - metadata={"help": "The name of input image." - }) # "http://images.cocodataset.org/val2017/000000039769.jpg" + default="http://images.cocodataset.org/val2017/000000039769.jpg", metadata={"help": "The name of input image."} + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" prompt: str = field( - default=None, - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default=None, metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" @dataclass @@ -58,64 +51,62 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) pretrained_model_path: str = field( default=None, - metadata={ - "help": - "The path to pre-trained model that we will use for inference." - }, ) + metadata={"help": "The path to pre-trained model that we will use for inference."}, + ) fp16: str = field( default=True, - metadata={"help": "Export with mixed precision."}, ) + metadata={"help": "Export with mixed precision."}, + ) def main(): parser = PdArgumentParser((ModelArguments, DataArguments)) model_args, data_args = parser.parse_args_into_dataclasses() - url = (data_args.input_image - ) # "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - prompt = "a photo of " - processor = Blip2Processor.from_pretrained(model_args.model_name_or_path) - model = Blip2ForConditionalGeneration.from_pretrained( - model_args.model_name_or_path) + # url = data_args.input_image # "http://images.cocodataset.org/val2017/000000039769.jpg" + # image = Image.open(requests.get(url, stream=True).raw) + + # prompt = "a photo of " + # processor = Blip2Processor.from_pretrained(model_args.model_name_or_path) + model = Blip2ForConditionalGeneration.from_pretrained(model_args.model_name_or_path) model.eval() dtype = "float32" if model_args.fp16: decorated = paddle.amp.decorate( - models=[model.visual_encoder, model.language_model], - optimizers=None, - level="O2") + models=[model.visual_encoder, model.language_model], optimizers=None, level="O2" + ) model.visual_encoder, model.language_model = decorated dtype = "float16" shape1 = [None, 3, None, None] - input_spec = [paddle.static.InputSpec(shape=shape1, dtype='float32'), ] - image_encoder = paddle.jit.to_static( - model.encode_image, input_spec=input_spec) + input_spec = [ + paddle.static.InputSpec(shape=shape1, dtype="float32"), + ] + image_encoder = paddle.jit.to_static(model.encode_image, input_spec=input_spec) save_path = "blip2_export" - 
paddle.jit.save(image_encoder, os.path.join(save_path, 'image_encoder')) + paddle.jit.save(image_encoder, os.path.join(save_path, "image_encoder")) # TODO add test config deploy_info = { - 'Deploy': { - 'model': 'image_encoder.pdmodel', - 'params': 'image_encoder.pdiparams', - 'input_img_shape': shape1, - 'output_dtype': dtype + "Deploy": { + "model": "image_encoder.pdmodel", + "params": "image_encoder.pdiparams", + "input_img_shape": shape1, + "output_dtype": dtype, } } - msg = '\n---------------Deploy Information---------------\n' + msg = "\n---------------Deploy Information---------------\n" msg += str(yaml.dump(deploy_info)) logger.info(msg) - yml_file = os.path.join(save_path, 'deploy.yaml') - with open(yml_file, 'w') as file: + yml_file = os.path.join(save_path, "deploy.yaml") + with open(yml_file, "w") as file: yaml.dump(deploy_info, file) - logger.info(f'The inference model is saved in {save_path}') + logger.info(f"The inference model is saved in {save_path}") if __name__ == "__main__": diff --git a/paddlemix/examples/blip2/merge_weight.py b/paddlemix/examples/blip2/merge_weight.py new file mode 100644 index 0000000000000..ae7adabe81dff --- /dev/null +++ b/paddlemix/examples/blip2/merge_weight.py @@ -0,0 +1,114 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +from paddlemix.utils.log import logger + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["FLAGS_use_cuda_managed_memory"] = "true" + +import paddle +import torch +from paddlenlp.transformers import LlamaForCausalLM +from paddlenlp.transformers.opt.modeling import OPTForCausalLM + + +def merge(args): + model_dict = {} + # load the first item: vision_model + state_dict = paddle.load(args.blip2_path) + for n, p in state_dict.items(): + if n.startswith("vision_model") or n.startswith("qformer") or n == "query_tokens": + model_dict[n] = p + logger.info("[1/3] load ViT, qformer and query_tokens done!") + + # load the second item: llm model + if "opt" in args.llm_name: + llm_model = OPTForCausalLM.from_pretrained(args.llm_path) + elif "llama" in args.llm_name: + llm_model = LlamaForCausalLM.from_pretrained(args.llm_path) + else: + raise ValueError(f"The LLM model {args.llm_name} is not supported.") + + for n, p in llm_model.named_parameters(): + new_name = "language_model." + n + model_dict[new_name] = p + logger.info("[2/3] load language_model done!") + + # load the third item: blip2 + llm_state_dict = torch.load(args.llm_path) + for n, p in llm_state_dict["model"].items(): + if n.startswith(args.llm_name + "_model.model"): + new_name = n.replace(args.llm_name + "_model.model", "language_model."
+ args.llm_name) + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + if n.startswith(args.llm_name + "_proj"): + new_name = n.replace(args.llm_name + "_proj", "language_projection") + if n.endswith("weight"): + new_p = paddle.to_tensor(p.cpu().numpy()).transpose([1, 0]) + else: + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + logger.info("[3/3] load language_projection, some llm weights from blip2 done!") + + save_path = os.path.join(args.save_path, "model_state.pdparams") + paddle.save(model_dict, save_path) + logger.info("The checkpoint of blip2 has been saved to: {}".format(save_path)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--blip2_path", + default="/blip2/dirname", + type=str, + help="The dir name of blip2-flan-t5-xxl.", + ) + parser.add_argument("--llm_name", default="opt", type=str, help="The name of llm model.") + parser.add_argument( + "--llm_path", + default="/llm/dirname", + type=str, + help="The dir name of llm model.", + ) + parser.add_argument( + "--save_path", + default="/save/to/dirname", + type=str, + help="The saving path of blip2.", + ) + args = parser.parse_args() + + args.blip2_path = os.path.join(args.blip2_path, "model_state.pdparams") + if not os.path.exists(args.blip2_path): + raise ValueError("Not found the file: {}".format(args.blip2_path)) + if not os.path.isdir(args.llm_path): + raise ValueError("It is not a directory: {}".format(args.llm_path)) + if not os.path.exists(args.llm_path): + raise ValueError("Not found the file: {}".format(args.llm_path)) + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + + merge(args) diff --git a/paddlemix/examples/blip2/run_eval_caption.py b/paddlemix/examples/blip2/run_eval_caption.py index 91443a8a4fc53..85ed6959f0e8d 100644 --- a/paddlemix/examples/blip2/run_eval_caption.py +++ b/paddlemix/examples/blip2/run_eval_caption.py @@ -12,31 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License.
-import sys import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -import paddle.distributed as dist -from paddle.distributed import fleet +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) +import random from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from sklearn.utils import compute_sample_weight -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) -from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint +from paddlenlp.transformers import AutoTokenizer + from paddlemix.datasets import load_dataset -from paddlemix.models.blip2.configuration import ( - Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.examples.blip2.utils import BlipCollator from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import BlipCollator @dataclass @@ -50,13 +50,11 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={ - "help": "The name of the task to use (via the datasets library)." - }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) prompt: str = field( - default="a photo of ", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="a photo of ", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" @dataclass @@ -67,11 +65,13 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) @dataclass @@ -79,99 +79,63 @@ class PreTrainingArguments(TrainingArguments): """ Arguments pertaining to what training options we are going to use during pretraining. 
""" - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=2000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=128, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=128, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=1, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) - warmup_start_lr: float = field( - default=1e-6, - metadata={"help": " The initial learning rate of blip2."}) + default=1, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) + warmup_start_lr: float = field(default=1e-6, metadata={"help": " The initial learning rate of blip2."}) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=True, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=True, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): # blip2_config = 
Blip2ForConditionalGeneration(onfig.model_name_or_path) - model = Blip2ForConditionalGeneration.from_pretrained( - pretrained_model_name_or_path=config.model_name_or_path) + model = Blip2ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=config.model_name_or_path) paddle.device.cuda.empty_cache() return model def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config @@ -186,14 +150,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -202,20 +164,21 @@ def main(): ) # create dataset - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) image_processor_eval = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) text_processor_class_eval = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) - eval_processor = Blip2Processor(image_processor_eval, - text_processor_class_eval, tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) + eval_processor = Blip2Processor(image_processor_eval, text_processor_class_eval, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} @@ -225,8 +188,7 @@ def main(): model_args.mp_degree = training_args.tensor_parallel_degree model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format( - training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) # create trainer trainer = Trainer( model=model, @@ -237,7 
+199,8 @@ def main(): eval_collator=blip_eval_collator, processor=processor, eval_processor=eval_processor, - tokenizer=tokenizer_class) + tokenizer=tokenizer_class, + ) eval_metrics = trainer.evaluate(eval_dataset) trainer.log_metrics("eval", eval_metrics) @@ -245,17 +208,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) # breakpoint() strategy.hybrid_configs = { "dp_degree": args.dp_degree, @@ -267,7 +228,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -287,8 +248,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ -296,12 +256,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/run_predict.py b/paddlemix/examples/blip2/run_predict.py index 29a24d402df11..50c870cedfdf7 100644 --- a/paddlemix/examples/blip2/run_predict.py +++ b/paddlemix/examples/blip2/run_predict.py @@ -11,30 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import os -import paddle.distributed as dist -from paddle.distributed import fleet +import random +import sys from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -from dataclasses import dataclass, field -import paddle + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) + import requests -from paddlenlp.trainer import PdArgumentParser +from paddlenlp.trainer import PdArgumentParser, TrainingArguments +from paddlenlp.transformers import AutoTokenizer from PIL import Image +from paddlemix.examples.blip2.utils import LLM_LIST, load_model from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import load_model, LLM_LIST -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments) @dataclass @@ -47,13 +49,11 @@ class DataArguments: """ input_image: str = field( - default="http://images.cocodataset.org/val2017/000000039769.jpg", - metadata={"help": "The name of input image." - }) # "http://images.cocodataset.org/val2017/000000039769.jpg" + default="http://images.cocodataset.org/val2017/000000039769.jpg", metadata={"help": "The name of input image."} + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" prompt: str = field( - default="describe the image", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="describe the image", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" @dataclass @@ -64,14 +64,14 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) - image_size: int = field( - default=224, - metadata={"help": " Image size for training. (default:224)"}) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + image_size: int = field(default=224, metadata={"help": " Image size for training. (default:224)"}) @dataclass @@ -79,84 +79,54 @@ class PreTrainingArguments(TrainingArguments): """ Arguments pertaining to what training options we are going to use during pretraining. 
""" - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=2000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=128, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=128, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=128, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) - warmup_start_lr: float = field( - default=1e-6, - metadata={"help": " The initial learning rate of blip2."}) + default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) + warmup_start_lr: float = field(default=1e-6, metadata={"help": " The initial learning rate of blip2."}) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=False, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): - model = Blip2ForConditionalGeneration.from_pretrained( - pretrained_model_name_or_path=config.model_name_or_path) + model = Blip2ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=config.model_name_or_path) paddle.device.cuda.empty_cache() return model def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() - url = (data_args.input_image - ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + url = data_args.input_image # "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -167,36 +137,29 @@ def main(): model_args.data_world_size = training_args.data_world_size paddle.set_device(training_args.device) prompt = data_args.prompt - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) 
image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) inputs = processor( images=image, text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) model = create_model(model_args) model.eval() if training_args.model_path is not None: checkpoint = training_args.model_path - load_model( - training_args, - model, - ckpt_dir=checkpoint, - load_language_model=False) - load_model( - training_args, - model.language_model, - ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) + load_model(training_args, model, ckpt_dir=checkpoint, load_language_model=False) + load_model(training_args, model.language_model, ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) generated_ids, scores = model.generate(**inputs) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True)[0].strip() + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) return model @@ -204,17 +167,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) if args.sharding_parallel_degree > 1: args.sharding = "stage1" strategy.hybrid_configs = { @@ -227,7 +188,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -244,8 +205,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ -253,12 +213,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) 
np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/run_pretrain_stage1.py b/paddlemix/examples/blip2/run_pretrain_stage1.py index 4b6a0847d70b3..bc0302ca7cbdf 100644 --- a/paddlemix/examples/blip2/run_pretrain_stage1.py +++ b/paddlemix/examples/blip2/run_pretrain_stage1.py @@ -12,31 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -import paddle.distributed as dist -from paddle.distributed import fleet +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) +import random from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) -from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint +from paddlenlp.transformers import AutoTokenizer + from paddlemix.datasets import load_dataset -from paddlemix.models.blip2.configuration import ( - Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.models.blip2.configuration import Blip2Config +from paddlemix.models.blip2.eva_vit import interpolate_pos_embed from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.models.blip2.eva_vit import interpolate_pos_embed -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import load_model class BlipCollator: @@ -58,15 +59,18 @@ def __call__(self, data_list): text = None else: text = [sample["text_input"] for sample in data_list] - image_id = [sample["image_id"] for sample in data_list] + # image_id = [sample["image_id"] for sample in data_list] batch = self.processor( images=images, return_tensors="pd", - mode=self.mode, ) + mode=self.mode, + ) # batch.update({'image_id':image_id},) - batch.update({'text_input_stage1': text}, ) + batch.update( + {"text_input_stage1": text}, + ) return batch @@ -81,13 +85,11 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={ - "help": "The name of the task to use (via the datasets library)." - }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) prompt: str = field( - default="a photo of ", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="a photo of ", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? 
Answer:" @dataclass @@ -98,14 +100,14 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-stage1", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) - image_size: int = field( - default=224, - metadata={"help": " Image size for training. (default:224)"}) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + image_size: int = field(default=224, metadata={"help": " Image size for training. (default:224)"}) @dataclass @@ -114,68 +116,41 @@ class PreTrainingArguments(TrainingArguments): Arguments pertaining to what training options we are going to use during pretraining. """ - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=5000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=5000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=256, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=256, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=128, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) + default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=False, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) - checkpoint_steps: int = field( - default=1000, metadata={"help": "save checkpoint with x steps"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) + checkpoint_steps: int = field(default=1000, metadata={"help": "save checkpoint with x steps"}) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): @@ -188,8 +163,7 @@ def create_model(config): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config @@ -204,14 +178,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = 
get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -220,14 +192,14 @@ def main(): ) # create dataset - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} @@ -236,8 +208,7 @@ def main(): model_args.mp_degree = training_args.tensor_parallel_degree model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format( - training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) # create trainer trainer = Trainer( model=model, @@ -246,7 +217,8 @@ def main(): eval_dataset=eval_dataset, data_collator=blip_collator, processor=processor, - tokenizer=tokenizer_class) + tokenizer=tokenizer_class, + ) # Training checkpoint = None if training_args.resume_from_checkpoint is not None: @@ -263,17 +235,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) if args.sharding_parallel_degree > 1: args.sharding = "stage1" strategy.hybrid_configs = { @@ -286,7 +256,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -303,8 +273,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ 
-312,12 +281,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/run_pretrain_stage2.py b/paddlemix/examples/blip2/run_pretrain_stage2.py index bff19a553dc73..e34cc7e229d0e 100644 --- a/paddlemix/examples/blip2/run_pretrain_stage2.py +++ b/paddlemix/examples/blip2/run_pretrain_stage2.py @@ -12,31 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -import paddle.distributed as dist -from paddle.distributed import fleet +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) +import random from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) -from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint +from paddlenlp.transformers import AutoTokenizer + from paddlemix.datasets import load_dataset -from paddlemix.models.blip2.configuration import ( - Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.examples.blip2.utils import LLM_LIST, BlipCollator, load_model +from paddlemix.models.blip2.configuration import Blip2Config from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.models.blip2.eva_vit import interpolate_pos_embed -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import BlipCollator, LLM_LIST, load_model @dataclass @@ -50,13 +51,11 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={ - "help": "The name of the task to use (via the datasets library)." - }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) prompt: str = field( - default="a photo of ", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="a photo of ", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? 
Answer:" @dataclass @@ -67,17 +66,15 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-stage2", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) - image_size: int = field( - default=224, - metadata={"help": " Image size for training. (default:224)"}) - llm_name: str = field( - default="opt-2.7b", - metadata={"help": "llm name which you ned to load in LLM_LIST"}) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + image_size: int = field(default=224, metadata={"help": " Image size for training. (default:224)"}) + llm_name: str = field(default="opt-2.7b", metadata={"help": "llm name which you ned to load in LLM_LIST"}) @dataclass @@ -85,75 +82,46 @@ class PreTrainingArguments(TrainingArguments): """ Arguments pertaining to what training options we are going to use during pretraining. """ - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=2000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=32, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=32, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=128, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) - warmup_start_lr: float = field( - default=1e-6, - metadata={"help": " The initial learning rate of blip2."}) + default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) + warmup_start_lr: float = field(default=1e-6, metadata={"help": " The initial learning rate of blip2."}) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=False, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) resume_from_checkpoint: str = field( default=None, - metadata={ - "help": - "The path to a folder with a valid checkpoint for your model." 
- }, ) + metadata={"help": "The path to a folder with a valid checkpoint for your model."}, + ) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): @@ -166,8 +134,7 @@ def create_model(config): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config @@ -182,14 +149,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -199,20 +164,21 @@ def main(): # create dataset - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) image_processor_eval = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) text_processor_class_eval = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) - eval_processor = Blip2Processor(image_processor_eval, - text_processor_class_eval, tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) + eval_processor = Blip2Processor(image_processor_eval, text_processor_class_eval, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} @@ -223,8 +189,7 @@ def main(): model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format( - training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) trainer = Trainer( model=model, 
args=training_args, @@ -234,32 +199,18 @@ def main(): eval_collator=blip_eval_collator, processor=processor, eval_processor=eval_processor, - tokenizer=tokenizer_class) + tokenizer=tokenizer_class, + ) # Training checkpoint = None if training_args.model_path is not None: checkpoint = training_args.model_path - load_model( - training_args, - model, - ckpt_dir=model_args.model_path, - load_language_model=False) - load_model( - training_args, - model.language_model, - ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) + load_model(training_args, model, ckpt_dir=model_args.model_path, load_language_model=False) + load_model(training_args, model.language_model, ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) if training_args.resume_from_checkpoint is not None: - checkpoint = os.path.join(training_args.resume_from_checkpoint, - "model_state.pdparams") - load_model( - training_args, - model, - ckpt_dir=checkpoint, - load_language_model=False) - load_model( - training_args, - model.language_model, - ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) + checkpoint = os.path.join(training_args.resume_from_checkpoint, "model_state.pdparams") + load_model(training_args, model, ckpt_dir=checkpoint, load_language_model=False) + load_model(training_args, model.language_model, ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) if training_args.do_eval: eval_metrics = trainer.evaluate(eval_dataset) trainer.log_metrics("eval", eval_metrics) @@ -272,17 +223,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) if args.sharding_parallel_degree > 1: args.sharding = "stage1" strategy.hybrid_configs = { @@ -295,7 +244,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -312,8 +261,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ -321,12 +269,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + 
data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/utils.py b/paddlemix/examples/blip2/utils.py index 7a0077de656b3..c93e0dec959ef 100644 --- a/paddlemix/examples/blip2/utils.py +++ b/paddlemix/examples/blip2/utils.py @@ -11,47 +11,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy +import datetime +import json import os +import re +import sys +import time + +import paddle from pycocoevalcap.eval import COCOEvalCap from pycocotools.coco import COCO -from paddlemix.utils.downloader import get_weights_path_from_url -from paddlemix.utils.downloader import is_url + from paddlemix.models.blip2.eva_vit import interpolate_pos_embed -import paddle +from paddlemix.utils.downloader import get_weights_path_from_url, is_url from paddlemix.utils.log import logger -import time -import json -import sys -import re -import json -import datetime -import copy LLM_LIST = { - "facebook/opt-2.7b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/opt-2.7b/model_state.pdparams", - "t5-small": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-small/model_state.pdparams", - "t5-base": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-base/model_state.pdparams", - "t5-large": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-large/model_state.pdparams", - "t5-3b": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-3b/model_state.pdparams", - "t5-11b": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-11b/model_state.pdparams", - "t5-v1_1-base": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-base/model_state.pdparams", - "t5-v1_1-large": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-large/model_state.pdparams", - "facebook/llama-7b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-7b/model_state.pdparams", - "facebook/llama-13b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-13b/model_state.pdparams", - "facebook/llama-30b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-30b/model_state.pdparams", - "facebook/llama-65b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-65b/model_state.pdparams", + "facebook/opt-2.7b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/opt-2.7b/model_state.pdparams", + "t5-small": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-small/model_state.pdparams", + "t5-base": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-base/model_state.pdparams", + "t5-large": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-large/model_state.pdparams", + "t5-3b": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-3b/model_state.pdparams", + "t5-11b": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-11b/model_state.pdparams", + "t5-v1_1-base": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-base/model_state.pdparams", + "t5-v1_1-large": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-large/model_state.pdparams", + "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-7b/model_state.pdparams", + "facebook/llama-13b": 
"https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-13b/model_state.pdparams", + "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-30b/model_state.pdparams", + "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-65b/model_state.pdparams", } @@ -81,25 +69,24 @@ def __call__(self, data_list): max_length=32, return_tensors="pd", return_attention_mask=True, - mode=self.mode, ) - batch.update({'image_id': image_id}) + mode=self.mode, + ) + batch.update({"image_id": image_id}) return batch def coco_caption_eval(coco_gt_root, results_file, split): - urls = { - "val": - "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json", - "test": - "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json", - } + # urls = { + # "val": "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json", + # "test": "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json", + # } filenames = { "val": "coco_karpathy_val_gt.json", "test": "coco_karpathy_test_gt.json", } - #download_url(urls[split], coco_gt_root) - annotation_file = os.path.join(coco_gt_root, filenames['test']) + # download_url(urls[split], coco_gt_root) + annotation_file = os.path.join(coco_gt_root, filenames["test"]) # create coco object and coco_result object coco = COCO(annotation_file) @@ -115,11 +102,7 @@ def coco_caption_eval(coco_gt_root, results_file, split): return coco_eval -def load_model(args, - model, - optimizer=None, - ckpt_dir="", - load_language_model=True): +def load_model(args, model, optimizer=None, ckpt_dir="", load_language_model=True): """ load the saved checkpoint file and update the state dicts of model and optimizer. 
""" @@ -140,18 +123,27 @@ def load_model(args, if ckpt_dir and os.path.isfile(ckpt_dir): # breakpoint() print("Try to load a whole checkpoint from %s " % ckpt_dir) - embedding_list = ['word_embeddings'] + embedding_list = ["word_embeddings"] collinear_list = [ - "fc1", "fc2", "qkv", "proj", "query", "key", "value", "qkv_proj", - "q_proj", "k_proj", "v_proj", "linear1", "linear2", "project_in", - "project_out" + "fc1", + "fc2", + "qkv", + "proj", + "query", + "key", + "value", + "qkv_proj", + "q_proj", + "k_proj", + "v_proj", + "linear1", + "linear2", + "project_in", + "project_out", ] rowlinear_list = ["out_proj"] all_list = collinear_list + rowlinear_list + embedding_list - skip_list = [ - 'visual_encoder.patch_embed.proj.weight', - 'visual_encoder.patch_embed.proj.bias' - ] + skip_list = ["visual_encoder.patch_embed.proj.weight", "visual_encoder.patch_embed.proj.bias"] col_list = [] row_list = [] @@ -161,10 +153,10 @@ def load_model(args, mp_size = args.tensor_parallel_degree def renamebias(model_dict, whole_key): - if 'q_bias' in whole_key: - key = whole_key.replace('q_bias', 'q_proj.bias') - elif 'v_bias' in whole_key: - key = whole_key.replace('v_bias', 'v_proj.bias') + if "q_bias" in whole_key: + key = whole_key.replace("q_bias", "q_proj.bias") + elif "v_bias" in whole_key: + key = whole_key.replace("v_bias", "v_proj.bias") model_dict[key] = model_dict[whole_key] del model_dict[whole_key] return model_dict @@ -172,47 +164,44 @@ def renamebias(model_dict, whole_key): def col_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[1] // mp_size - return model_dict[:, mp_rank * subbatch:(mp_rank + 1) * - subbatch] + return model_dict[:, mp_rank * subbatch : (mp_rank + 1) * subbatch] elif len(model_dict.shape) == 1: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] def row_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] else: return model_dict def emb_split_modeldict(model_dict): subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] model_dict = paddle.load(ckpt_dir) for whole_key in model_dict.keys(): - if not '.' in whole_key: + if "." 
not in whole_key: continue - key = whole_key.split('.')[-2] + key = whole_key.split(".")[-2] if whole_key in skip_list: continue if key in all_list: if key in collinear_list: col_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = col_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = col_split_modeldict(model_dict[whole_key]) elif key in rowlinear_list: row_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = row_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = row_split_modeldict(model_dict[whole_key]) else: emb_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = emb_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = emb_split_modeldict(model_dict[whole_key]) param_state_dict = model_dict import numpy as np + model_dict = model.state_dict() model_weight = {} incorrect_keys = 0 @@ -220,21 +209,19 @@ def emb_split_modeldict(model_dict): if key in param_state_dict.keys(): if isinstance(param_state_dict[key], np.ndarray): - param_state_dict[key] = paddle.to_tensor(param_state_dict[ - key]) + param_state_dict[key] = paddle.to_tensor(param_state_dict[key]) if value.dtype == param_state_dict[key].dtype: model_weight[key] = param_state_dict[key] else: - model_weight[key] = param_state_dict[key].astype( - value.dtype) + model_weight[key] = param_state_dict[key].astype(value.dtype) if value.shape != param_state_dict[key].shape: - logger.info('Unmatched key: {}'.format(key)) + logger.info("Unmatched key: {}".format(key)) print(value.shape, param_state_dict[key].shape, key) else: - if load_language_model == False and "language_model" in key: + if load_language_model is False and "language_model" in key: continue - logger.info('Unmatched key: {}'.format(key)) + logger.info("Unmatched key: {}".format(key)) incorrect_keys += 1 interpolate_pos_embed(model, model_weight) model.set_state_dict(model_weight) @@ -245,13 +232,13 @@ def emb_split_modeldict(model_dict): raise TypeError("`load` requires a valid value of `ckpt_dir`.") -def save_result(result, result_dir, filename, remove_duplicate="", - world_size=1): +def save_result(result, result_dir, filename, remove_duplicate="", world_size=1): import logging + rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) - result_file = os.path.join(result_dir, - "%s_rank%d.json" % (filename, rank_id_curr_node)) - if not os.path.exists(result_dir): os.mkdir(result_dir) + result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank_id_curr_node)) + if not os.path.exists(result_dir): + os.mkdir(result_dir) json.dump(result, open(result_file, "w")) final_result_file = os.path.join(result_dir, "%s.json" % filename) @@ -262,8 +249,7 @@ def save_result(result, result_dir, filename, remove_duplicate="", result = [] # for rank in range(get_world_size()): for rank in range(int(os.environ.get("PADDLE_TRAINERS_NUM", 1))): - result_file = os.path.join(result_dir, - "%s_rank%d.json" % (filename, rank)) + result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank)) res = json.load(open(result_file, "r")) result += res @@ -281,8 +267,7 @@ def save_result(result, result_dir, filename, remove_duplicate="", else: while not os.path.exists(final_result_file): time.sleep(0.5) - logging.warning("rank %d waits rank0 to merge results." % - rank_id_curr_node) + logging.warning("rank %d waits rank0 to merge results." 
% rank_id_curr_node) # combine results from all processes return final_result_file @@ -464,7 +449,7 @@ def __init__(self, vqa=None, vqaRes=None, n=2): ] def evaluate(self, quesIds=None): - if quesIds == None: + if quesIds is None: quesIds = [quesId for quesId in self.params["question_id"]] gts = {} res = {} @@ -493,13 +478,8 @@ def evaluate(self, quesIds=None): for ansDic in gts[quesId]["answers"]: ansDic["answer"] = self.processPunctuation(ansDic["answer"]) for gtAnsDatum in gts[quesId]["answers"]: - otherGTAns = [ - item for item in gts[quesId]["answers"] - if item != gtAnsDatum - ] - matchingAns = [ - item for item in otherGTAns if item["answer"] == resAns - ] + otherGTAns = [item for item in gts[quesId]["answers"] if item != gtAnsDatum] + matchingAns = [item for item in otherGTAns if item["answer"] == resAns] acc = min(1, float(len(matchingAns)) / 3) gtAcc.append(acc) quesType = gts[quesId]["question_type"] @@ -525,8 +505,7 @@ def evaluate(self, quesIds=None): def processPunctuation(self, inText): outText = inText for p in self.punct: - if (p + " " in inText or " " + p in inText) or ( - re.search(self.commaStrip, inText) != None): + if (p + " " in inText or " " + p in inText) or (re.search(self.commaStrip, inText) is not None): outText = outText.replace(p, "") else: outText = outText.replace(p, " ") @@ -549,18 +528,16 @@ def processDigitArticle(self, inText): return outText def setAccuracy(self, accQA, accQuesType, accAnsType): - self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), - self.n) + self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n) self.accuracy["perQuestionType"] = { quesType: round( - 100 * float(sum(accQuesType[quesType])) / - len(accQuesType[quesType]), - self.n, ) + 100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]), + self.n, + ) for quesType in accQuesType } self.accuracy["perAnswerType"] = { - ansType: round(100 * float(sum(accAnsType[ansType])) / - len(accAnsType[ansType]), self.n) + ansType: round(100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n) for ansType in accAnsType } @@ -593,8 +570,8 @@ def updateProgress(self, progress): status = "Done...\r\n" block = int(round(barLength * progress)) text = "\rFinshed Percent: [{0}] {1}% {2}".format( - "#" * block + "-" * (barLength - block), - int(progress * 100), status) + "#" * block + "-" * (barLength - block), int(progress * 100), status + ) sys.stdout.write(text) sys.stdout.flush() @@ -612,9 +589,9 @@ def __init__(self, annotation_file=None, question_file=None): self.qa = {} self.qqa = {} self.imgToQA = {} - if not annotation_file == None and not question_file == None: + if annotation_file is not None and question_file is not None: print("loading VQA annotations and questions into memory...") - time_t = datetime.datetime.utcnow() + # time_t = datetime.datetime.utcnow() dataset = json.load(open(annotation_file, "r")) questions = json.load(open(question_file, "r")) self.dataset = dataset @@ -664,17 +641,13 @@ def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]): else: if not len(imgIds) == 0: anns = sum( - [ - self.imgToQA[imgId] for imgId in imgIds - if imgId in self.imgToQA - ], - [], ) + [self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA], + [], + ) else: anns = self.dataset["annotations"] - anns = (anns if len(quesTypes) == 0 else - [ann for ann in anns if ann["question_type"] in quesTypes]) - anns = (anns if len(ansTypes) == 0 else - [ann for ann in anns if ann["answer_type"] in ansTypes]) + anns = anns if 
len(quesTypes) == 0 else [ann for ann in anns if ann["question_type"] in quesTypes] + anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann["answer_type"] in ansTypes] ids = [ann["question_id"] for ann in anns] return ids @@ -694,15 +667,11 @@ def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]): anns = self.dataset["annotations"] else: if not len(quesIds) == 0: - anns = sum([ - self.qa[quesId] for quesId in quesIds if quesId in self.qa - ], []) + anns = sum([self.qa[quesId] for quesId in quesIds if quesId in self.qa], []) else: anns = self.dataset["annotations"] - anns = (anns if len(quesTypes) == 0 else - [ann for ann in anns if ann["question_type"] in quesTypes]) - anns = (anns if len(ansTypes) == 0 else - [ann for ann in anns if ann["answer_type"] in ansTypes]) + anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann["question_type"] in quesTypes] + anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann["answer_type"] in ansTypes] ids = [ann["image_id"] for ann in anns] return ids @@ -742,8 +711,7 @@ def loadRes(self, resFile, quesFile): res.dataset["info"] = copy.deepcopy(self.questions["info"]) res.dataset["task_type"] = copy.deepcopy(self.questions["task_type"]) res.dataset["data_type"] = copy.deepcopy(self.questions["data_type"]) - res.dataset["data_subtype"] = copy.deepcopy(self.questions[ - "data_subtype"]) + res.dataset["data_subtype"] = copy.deepcopy(self.questions["data_subtype"]) res.dataset["license"] = copy.deepcopy(self.questions["license"]) print("Loading and preparing results... ") @@ -751,20 +719,21 @@ def loadRes(self, resFile, quesFile): anns = json.load(open(resFile)) assert type(anns) == list, "results is not an array of objects" annsQuesIds = [ann["question_id"] for ann in anns] - assert set(annsQuesIds) == set(self.getQuesIds( - )), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file." + assert set(annsQuesIds) == set( + self.getQuesIds() + ), "Results do not correspond to the current VQA set. Either the results do not have predictions for all question ids in the annotation file, or there is at least one question id that does not belong to the question ids in the annotation file." 
for ann in anns: quesId = ann["question_id"] if res.dataset["task_type"] == "Multiple Choice": - assert (ann["answer"] in self.qqa[quesId]["multiple_choices"] - ), "predicted answer is not one of the multiple choices" + assert ( + ann["answer"] in self.qqa[quesId]["multiple_choices"] + ), "predicted answer is not one of the multiple choices" qaAnn = self.qa[quesId] ann["image_id"] = qaAnn["image_id"] ann["question_type"] = qaAnn["question_type"] ann["answer_type"] = qaAnn["answer_type"] - print("DONE (t=%0.2fs)" % ( - (datetime.datetime.utcnow() - time_t).total_seconds())) + print("DONE (t=%0.2fs)" % ((datetime.datetime.utcnow() - time_t).total_seconds())) res.dataset["annotations"] = anns res.createIndex() - return res \ No newline at end of file + return res diff --git a/paddlemix/examples/evaclip/run_pretrain_dist.py b/paddlemix/examples/evaclip/run_pretrain_dist.py index 82573d19ebb21..ad7967bdead5a 100644 --- a/paddlemix/examples/evaclip/run_pretrain_dist.py +++ b/paddlemix/examples/evaclip/run_pretrain_dist.py @@ -24,17 +24,18 @@ from dataclasses import dataclass, field import paddle -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments from paddlenlp.transformers import AutoTokenizer -from paddlemix.checkpoint import load_model, save +from paddlemix.checkpoint import load_model from paddlemix.datasets import load_dataset -from paddlemix.models.evaclip.eva_clip_model import EVACLIP +from paddlemix.models.evaclip.eva_clip_model import EVACLIP, EVACLIPConfig from paddlemix.optimization import create_optimizer -from paddlemix.processors import SimpleTokenizer from paddlemix.processors.clip_processing import ( - CLIPImageProcessor, CLIPProcessor, CLIPTextProcessor) + CLIPImageProcessor, + CLIPProcessor, + CLIPTextProcessor, +) from paddlemix.trainer import CLIPTrainer from paddlemix.utils.env import setdistenv @@ -50,21 +51,23 @@ class DataArguments: task_name: str = field( default="coco_clip", - metadata={ - "help": "The name of the task to use (via the datasets library)." 
- }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) image_size: int = field( default=224, - metadata={"help": "image size for training"}, ) + metadata={"help": "image size for training"}, + ) train_data: str = field( default="", - metadata={"help": "The traindata list path."}, ) + metadata={"help": "The traindata list path."}, + ) precomputed_text_emb: str = field( default="open_clip_vit_g_14", - metadata={"help": "precomputed_text_emb name."}, ) + metadata={"help": "precomputed_text_emb name."}, + ) @dataclass @@ -75,19 +78,20 @@ class ModelArguments: model: str = field( default="paddlemix/EVA/EVA02-CLIP-L-14", - metadata={ - "help": - "model name to create, for example [EVA02-CLIP-B-16/coca_EVA02-B-16]" - }, ) + metadata={"help": "model name to create, for example [EVA02-CLIP-B-16/coca_EVA02-B-16]"}, + ) model_name_or_path: str = field( default="clip", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) coca_caption_loss_weight: float = field( default=2.0, - metadata={"help": "coca_caption_loss_weight set, default: 2.0"}, ) + metadata={"help": "coca_caption_loss_weight set, default: 2.0"}, + ) coca_contrastive_loss_weight: float = field( default=1.0, - metadata={"help": "coca_contrastive_loss_weight set, default: 1.0"}, ) + metadata={"help": "coca_contrastive_loss_weight set, default: 1.0"}, + ) @dataclass @@ -98,48 +102,41 @@ class PreTrainingArguments(TrainingArguments): pretrained_model_path: str = field( default=None, - metadata={ - "help": - "The path to pre-trained model that we will use for pretraining." - }, ) - text_wd: float = field( - default=0.05, metadata={"help": "Weight decay for text tower"}) - visual_wd: float = field( - default=0.05, metadata={"help": "Weight decay for visual tower"}) - text_lr: float = field( - default=2e-5, - metadata={"help": "The initial learning rate of text tower."}) - visual_lr: float = field( - default=2e-4, - metadata={"help": "The initial learning rate of visual tower."}) - layer_decay: float = field( - default=1.0, metadata={"help": "The basic layer decay."}) - text_ld: float = field( - default=0.75, metadata={"help": "The layer decay of text tower."}) - visual_ld: float = field( - default=0.75, metadata={"help": "The layer decay of visual tower."}) + metadata={"help": "The path to pre-trained model that we will use for pretraining."}, + ) + text_wd: float = field(default=0.05, metadata={"help": "Weight decay for text tower"}) + visual_wd: float = field(default=0.05, metadata={"help": "Weight decay for visual tower"}) + text_lr: float = field(default=2e-5, metadata={"help": "The initial learning rate of text tower."}) + visual_lr: float = field(default=2e-4, metadata={"help": "The initial learning rate of visual tower."}) + layer_decay: float = field(default=1.0, metadata={"help": "The basic layer decay."}) + text_ld: float = field(default=0.75, metadata={"help": "The layer decay of text tower."}) + visual_ld: float = field(default=0.75, metadata={"help": "The layer decay of visual tower."}) start_epoch: int = field( default=0, - metadata={"help": " manual epoch number (useful on restarts)"}, ) + metadata={"help": " manual epoch number (useful on restarts)"}, + ) context_length: int = field( default=77, - metadata={"help": " context length for text."}, ) - optimizer: str = field( - default="lamb", metadata={"help": "optimizer setting, [lamb/adamw]"}) + metadata={"help": " context length for text."}, + ) + 
optimizer: str = field(default="lamb", metadata={"help": "optimizer setting, [lamb/adamw]"}) dp_degree: int = field( default=2, - metadata={"help": " data parallel degrees."}, ) - last_epoch: int = field( - default=-1, metadata={"help": "the last epoch to resume"}) + metadata={"help": " data parallel degrees."}, + ) + last_epoch: int = field(default=-1, metadata={"help": "the last epoch to resume"}) gather_with_grad: bool = field( default=False, - metadata={"help": "Whether to use gather_with_grad in loss."}, ) + metadata={"help": "Whether to use gather_with_grad in loss."}, + ) local_loss: bool = field( default=False, - metadata={"help": "Whether to use local loss in loss."}, ) + metadata={"help": "Whether to use local loss in loss."}, + ) tensorboard: bool = field( default=False, - metadata={"help": "Whether to use tensorboard to record loss."}, ) + metadata={"help": "Whether to use tensorboard to record loss."}, + ) class SelfTrainer(CLIPTrainer): @@ -154,16 +151,17 @@ def create_optimizer_and_scheduler(self, num_training_steps: int): self.lr_scheduler = paddle.optimizer.lr.CosineAnnealingDecay( 1.0, num_training_steps - self.args.warmup_steps, - last_epoch=self.args.last_epoch, ) + last_epoch=self.args.last_epoch, + ) if self.args.warmup_steps > 0: self.lr_scheduler = paddle.optimizer.lr.LinearWarmup( self.lr_scheduler, self.args.warmup_steps, 0, 1.0, - last_epoch=self.args.last_epoch, ) - self.optimizer = create_optimizer(self.args, self.model, - self.lr_scheduler) + last_epoch=self.args.last_epoch, + ) + self.optimizer = create_optimizer(self.args, self.model, self.lr_scheduler) class Collator: @@ -187,7 +185,8 @@ def __call__(self, data_list): max_length=77, return_tensors="pd", return_attention_mask=False, - mode="train", ) + mode="train", + ) return batch @@ -202,22 +201,22 @@ def main_worker(training_args, model_args, data_args): local_loss=training_args.local_loss, gather_with_grad=training_args.gather_with_grad, data_world_rank=training_args.data_world_rank, - data_world_size=training_args.data_world_size, ) + data_world_size=training_args.data_world_size, + ) training_args.model = model_args.model - if (training_args.pretrained_model_path and - training_args.pretrained_model_path != "None" and - training_args.resume_from_checkpoint is None): - load_model( - training_args, model, ckpt_dir=training_args.pretrained_model_path) + if ( + training_args.pretrained_model_path + and training_args.pretrained_model_path != "None" + and training_args.resume_from_checkpoint is None + ): + load_model(training_args, model, ckpt_dir=training_args.pretrained_model_path) if training_args.bf16 and training_args.fp16_opt_level == "O2": paddle.set_default_dtype("float32") train_dataset = load_dataset("coco_clip", splits="train") - image_processor = CLIPImageProcessor.from_pretrained( - model_args.model_name_or_path) - text_processor = CLIPTextProcessor.from_pretrained( - model_args.model_name_or_path) + image_processor = CLIPImageProcessor.from_pretrained(model_args.model_name_or_path) + text_processor = CLIPTextProcessor.from_pretrained(model_args.model_name_or_path) tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) processor = CLIPProcessor(image_processor, text_processor, tokenizer) collator = Collator(processor) @@ -226,7 +225,8 @@ def main_worker(training_args, model_args, data_args): model=model, args=training_args, train_dataset=train_dataset, - data_collator=collator, ) + data_collator=collator, + ) # Training checkpoint = None @@ -239,11 +239,8 @@ def 
main_worker(training_args, model_args, data_args): trainer.save_state() -from paddlemix.models.evaclip.eva_clip_model import EVACLIP, EVACLIPConfig - if __name__ == "__main__": - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.hostname = socket.gethostname() pprint.pprint(data_args) diff --git a/paddlemix/examples/evaclip/run_zero_shot_eval.py b/paddlemix/examples/evaclip/run_zero_shot_eval.py index 984495cd049fc..236b54795a4ff 100644 --- a/paddlemix/examples/evaclip/run_zero_shot_eval.py +++ b/paddlemix/examples/evaclip/run_zero_shot_eval.py @@ -23,10 +23,9 @@ import socket from dataclasses import dataclass, field -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments -from paddlemix.checkpoint import load_model, save +from paddlemix.checkpoint import load_model from paddlemix.datasets.laion_clip import get_data from paddlemix.metrics.clip_zero_shot import zero_shot_eval from paddlemix.models.evaclip.eva_clip_model import EVACLIP @@ -45,11 +44,13 @@ class DataArguments: classification_eval: str = field( default="", - metadata={"help": "Path to IN1K data."}, ) + metadata={"help": "Path to IN1K data."}, + ) precomputed_text_emb: str = field( default="open_clip_vit_g_14", - metadata={"help": "precomputed_text_emb name."}, ) + metadata={"help": "precomputed_text_emb name."}, + ) @dataclass @@ -60,13 +61,12 @@ class ModelArguments: model: str = field( default="paddlemix/EVA/EVA02-CLIP-L-14", - metadata={ - "help": - "model name to create, for example paddlemix/EVA/EVA02-CLIP-L-14" - }, ) + metadata={"help": "model name to create, for example paddlemix/EVA/EVA02-CLIP-L-14"}, + ) model_name_or_path: str = field( default="clip", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) @dataclass @@ -77,13 +77,9 @@ class PreTrainingArguments(TrainingArguments): pretrained_model_path: str = field( default=None, - metadata={ - "help": - "The path to pre-trained model that we will use for pretraining." 
- }, ) - pretrained_text_model: str = field( - default="openclip", - metadata={"help": "the model to pre-extract text feats"}) + metadata={"help": "The path to pre-trained model that we will use for pretraining."}, + ) + pretrained_text_model: str = field(default="openclip", metadata={"help": "the model to pre-extract text feats"}) def evaluate(model, dataloader_dict, args): @@ -94,15 +90,15 @@ def evaluate(model, dataloader_dict, args): def main_worker(training_args, model_args, data_args): - model = EVACLIP.from_pretrained( - model_args.model, ignore_mismatched_sizes=False) + model = EVACLIP.from_pretrained(model_args.model, ignore_mismatched_sizes=False) training_args.model = model_args.model - if (training_args.pretrained_model_path and - training_args.pretrained_model_path != "None" and - training_args.resume_from_checkpoint is None): - load_model( - training_args, model, ckpt_dir=training_args.pretrained_model_path) + if ( + training_args.pretrained_model_path + and training_args.pretrained_model_path != "None" + and training_args.resume_from_checkpoint is None + ): + load_model(training_args, model, ckpt_dir=training_args.pretrained_model_path) preprocess_train = image_transform(model.visual.image_size, is_train=True) preprocess_val = image_transform(model.visual.image_size, is_train=False) @@ -112,8 +108,7 @@ def main_worker(training_args, model_args, data_args): if __name__ == "__main__": - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.hostname = socket.gethostname() pprint.pprint(data_args) diff --git a/paddlemix/examples/groundingdino/run_predict.py b/paddlemix/examples/groundingdino/run_predict.py index bf8022aabe463..dbd7f959cfc7c 100644 --- a/paddlemix/examples/groundingdino/run_predict.py +++ b/paddlemix/examples/groundingdino/run_predict.py @@ -78,9 +78,7 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - prompt: str = field( - default=None, - metadata={"help": "The prompt of the image to be generated."}) + prompt: str = field(default=None, metadata={"help": "The prompt of the image to be generated."}) @dataclass @@ -91,19 +89,24 @@ class ModelArguments: model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -111,12 +114,10 @@ def main(): model_args, data_args = parser.parse_args_into_dataclasses() # bulid processor - processor = GroudingDinoProcessor.from_pretrained( - model_args.model_name_or_path) + processor = GroudingDinoProcessor.from_pretrained(model_args.model_name_or_path) # bulid model logger.info("dino_model: {}".format(model_args.model_name_or_path)) - dino_model = GroundingDinoModel.from_pretrained( - 
model_args.model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.model_name_or_path) dino_model.eval() # read image url = data_args.input_image @@ -125,11 +126,9 @@ def main(): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = processor( - images=image_pil, text=data_args.prompt) + image_tensor, mask, tokenized_out = processor(images=image_pil, text=data_args.prompt) with paddle.no_grad(): outputs = dino_model( @@ -137,9 +136,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) diff --git a/paddlemix/examples/imagebind/run_predict.py b/paddlemix/examples/imagebind/run_predict.py index a304a15403219..f195c74d8b8f9 100644 --- a/paddlemix/examples/imagebind/run_predict.py +++ b/paddlemix/examples/imagebind/run_predict.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os -import sys from dataclasses import dataclass, field -import numpy as np import paddle import requests from paddlenlp.trainer import PdArgumentParser @@ -26,16 +23,13 @@ from paddlemix import ImageBindModel, ImageBindProcessor from paddlemix.datasets import * from paddlemix.models import ModalityType -from paddlemix.models.imagebind.modeling import ImageBindModel from paddlemix.utils.log import logger class Predictor: def __init__(self, model_args): - self.processor = ImageBindProcessor.from_pretrained( - model_args.model_name_or_path) - self.predictor = ImageBindModel.from_pretrained( - model_args.model_name_or_path) + self.processor = ImageBindProcessor.from_pretrained(model_args.model_name_or_path) + self.predictor = ImageBindModel.from_pretrained(model_args.model_name_or_path) self.predictor.eval() def run(self, inputs): @@ -55,8 +49,7 @@ def main(model_args, data_args): # read image image_pil = Image.open(data_args.input_image).convert("RGB") elif url: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") else: image_pil = None @@ -66,7 +59,8 @@ def main(model_args, data_args): images=image_pil, text=data_args.input_text, audios=data_args.input_audio, - return_tensors="pd", ) + return_tensors="pd", + ) inputs = {} if data_args.input_text: tokenized_processor = encoding["input_ids"] @@ -84,8 +78,7 @@ def main(model_args, data_args): if data_args.input_text: logger.info("Generate text: {}".format(embeddings[ModalityType.TEXT])) if image_pil: - logger.info("Generate vision: {}".format(embeddings[ - ModalityType.VISION])) + logger.info("Generate vision: {}".format(embeddings[ModalityType.VISION])) if data_args.input_audio: logger.info("Generate audio: {}".format(embeddings[ModalityType.AUDIO])) @@ -99,17 +92,17 @@ class DataArguments: the command line. 
""" - input_text: str = field( - default="A dog.", - metadata={"help": "The name of imagebind text input."}) + input_text: str = field(default="A dog.", metadata={"help": "The name of imagebind text input."}) input_image: str = field( default="", # wget https://github.com/facebookresearch/ImageBind/blob/main/.assets/bird_image.jpg - metadata={"help": "The name of imagebind image input."}, ) + metadata={"help": "The name of imagebind image input."}, + ) input_audio: str = field( default=None, # wget https://github.com/facebookresearch/ImageBind/blob/main/.assets/bird_audio.wav - metadata={"help": "The name of imagebind audio input."}, ) + metadata={"help": "The name of imagebind audio input."}, + ) @dataclass @@ -120,14 +113,13 @@ class ModelArguments: model_name_or_path: str = field( default="imagebind-1.2b/", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) device: str = field( default="GPU", - metadata={ - "help": - "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU." - }, ) + metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."}, + ) if __name__ == "__main__": diff --git a/paddlemix/examples/minigpt4/merge_weight.py b/paddlemix/examples/minigpt4/merge_weight.py index 5f06d130fb67e..e6a4035bf1816 100644 --- a/paddlemix/examples/minigpt4/merge_weight.py +++ b/paddlemix/examples/minigpt4/merge_weight.py @@ -28,11 +28,9 @@ def merge(args): # load the first item: blip2-flan-t5-xxl state_dict = paddle.load(args.blip2_path) for n, p in state_dict.items(): - if (n.startswith("vision_model") or n.startswith("qformer") or - n == "query_tokens"): + if n.startswith("vision_model") or n.startswith("qformer") or n == "query_tokens": model_dict[n] = p - print( - "[1/3] load ViT, qformer and query_tokens from blip2-flan-t5-xxl done!") + print("[1/3] load ViT, qformer and query_tokens from blip2-flan-t5-xxl done!") # load the second item: vicuna llama_model = LlamaForCausalLM.from_pretrained(args.vicuna_path) @@ -58,8 +56,7 @@ def merge(args): new_p = paddle.to_tensor(p.cpu().numpy()) model_dict[new_name] = new_p - print( - "[3/3] load language_projection, some llama weights from minigpt4 done!") + print("[3/3] load language_projection, some llama weights from minigpt4 done!") save_path = os.path.join(args.save_path, "model_state.pdparams") paddle.save(model_dict, save_path) @@ -73,22 +70,26 @@ def merge(args): "--blip2_path", default="/blip2/dirname", type=str, - help="The dir name of blip2-flan-t5-xxl.", ) + help="The dir name of blip2-flan-t5-xxl.", + ) parser.add_argument( "--vicuna_path", default="/vicuna/dirname", type=str, - help="The dir name of vicuna.", ) + help="The dir name of vicuna.", + ) parser.add_argument( "--minigpt4_path", default="/minigpt4/prerained_minigpt4.pth", type=str, - help="The checkpoint path of vicuna.", ) + help="The checkpoint path of vicuna.", + ) parser.add_argument( "--save_path", default="/save/to/dirname", type=str, - help="The saving path of minigpt4.", ) + help="The saving path of minigpt4.", + ) args = parser.parse_args() args.blip2_path = os.path.join(args.blip2_path, "model_state.pdparams") diff --git a/paddlemix/examples/minigpt4/run_predict.py b/paddlemix/examples/minigpt4/run_predict.py index a883f611471dd..31da230530508 100644 --- a/paddlemix/examples/minigpt4/run_predict.py +++ b/paddlemix/examples/minigpt4/run_predict.py @@ -25,8 +25,7 @@ def predict(args): # load MiniGPT4 moel and processor - 
model = MiniGPT4ForConditionalGeneration.from_pretrained( - args.pretrained_name_or_path) + model = MiniGPT4ForConditionalGeneration.from_pretrained(args.pretrained_name_or_path) model.eval() processor = MiniGPT4Processor.from_pretrained(args.pretrained_name_or_path) print("load processor and model done!") @@ -61,7 +60,8 @@ def predict(args): "--pretrained_name_or_path", default="your directory of minigpt4", type=str, - help="The dir name of minigpt4 checkpoint.", ) + help="The dir name of minigpt4 checkpoint.", + ) args = parser.parse_args() predict(args) diff --git a/paddlemix/examples/visualglm/run_predict.py b/paddlemix/examples/visualglm/run_predict.py index a11a52f904c20..dabf2b4c5534e 100644 --- a/paddlemix/examples/visualglm/run_predict.py +++ b/paddlemix/examples/visualglm/run_predict.py @@ -26,8 +26,7 @@ def predict(args): # load VisualGLM moel and processor - model = VisualGLMForConditionalGeneration.from_pretrained( - args.pretrained_name_or_path, dtype="float16") + model = VisualGLMForConditionalGeneration.from_pretrained(args.pretrained_name_or_path, dtype="float16") model.eval() processor = VisualGLMProcessor.from_pretrained(args.pretrained_name_or_path) print("load processor and model done!") @@ -70,7 +69,8 @@ def predict(args): "--pretrained_name_or_path", default="THUDM/visualglm-6b", type=str, - help="The dir name of visualglm checkpoint.", ) + help="The dir name of visualglm checkpoint.", + ) args = parser.parse_args() predict(args) diff --git a/paddlemix/external_ops/setup.py b/paddlemix/external_ops/setup.py index 7b1fa658805dc..2c5cd345c21e6 100644 --- a/paddlemix/external_ops/setup.py +++ b/paddlemix/external_ops/setup.py @@ -63,8 +63,11 @@ def setup_fast_ln(): "--expt-relaxed-constexpr", "--expt-extended-lambda", "--use_fast_math", - ] + gencode_flags, - }, ), ) + ] + + gencode_flags, + }, + ), + ) def setup_fused_ln(): @@ -75,7 +78,9 @@ def setup_fused_ln(): setup( name="fused_ln", ext_modules=CUDAExtension( - sources=["fused_ln/layer_norm_cuda.cu", ], + sources=[ + "fused_ln/layer_norm_cuda.cu", + ], extra_compile_args={ "cxx": ["-O3"], "nvcc": [ @@ -91,8 +96,11 @@ def setup_fused_ln(): "--expt-extended-lambda", "--use_fast_math", "-maxrregcount=50", - ] + gencode_flags, - }, ), ) + ] + + gencode_flags, + }, + ), + ) run(setup_fast_ln) diff --git a/paddlemix/metrics/clip_zero_shot.py b/paddlemix/metrics/clip_zero_shot.py index c7271cd2f1f4e..f3037795883c0 100644 --- a/paddlemix/metrics/clip_zero_shot.py +++ b/paddlemix/metrics/clip_zero_shot.py @@ -21,29 +21,24 @@ from paddlemix.processors.tokenizer import tokenize -def zero_shot_classifier(model, - classnames_filename, - templates_filename, - args, - text_tower=None): +def zero_shot_classifier(model, classnames_filename, templates_filename, args, text_tower=None): classnames = [i.strip() for i in open(classnames_filename).readlines()] templates = [i.strip() for i in open(templates_filename).readlines()] if text_tower is None: if hasattr(model, "_layers"): - text_tower = (model._layers.module.encode_text - if not hasattr(model._layers, "encode_text") else - model._layers.encode_text) + text_tower = ( + model._layers.module.encode_text + if not hasattr(model._layers, "encode_text") + else model._layers.encode_text + ) else: - text_tower = (model.module.encode_text - if not hasattr(model, "encode_text") else - model.encode_text) + text_tower = model.module.encode_text if not hasattr(model, "encode_text") else model.encode_text tokenizer = tokenize with paddle.no_grad(): zeroshot_weights = [] for classname in 
tqdm(classnames): - texts = [template.format(classname) - for template in templates] # format with class + texts = [template.format(classname) for template in templates] # format with class texts = tokenizer(texts) # tokenize class_embeddings = text_tower(texts) @@ -54,13 +49,10 @@ def zero_shot_classifier(model, return zeroshot_weights -def accuracy(output, target, topk=(1, )): +def accuracy(output, target, topk=(1,)): pred = output.topk(max(topk), 1, True, True)[1].t() correct = pred.equal(target.reshape([1, -1]).expand_as(pred)) - return [ - float(correct[:k].reshape([-1]).astype(paddle.float32) - .sum(0, keepdim=True).numpy()) for k in topk - ] + return [float(correct[:k].reshape([-1]).astype(paddle.float32).sum(0, keepdim=True).numpy()) for k in topk] class DummyAutocast: @@ -97,8 +89,7 @@ def run(model, classifier, dataloader, args): autocast = get_autocast(cast_dtype) with paddle.no_grad(): top1, top5, n = 0.0, 0.0, 0.0 - for images, target in tqdm( - dataloader, unit_scale=args.per_device_eval_batch_size): + for images, target in tqdm(dataloader, unit_scale=args.per_device_eval_batch_size): if cast_dtype is not None: images = images.cast(cast_dtype) target = target @@ -109,11 +100,11 @@ def run(model, classifier, dataloader, args): else: image_features = model.encode_image(images) image_features = F.normalize(image_features, axis=-1) - logits = 100.0 * image_features @classifier + logits = 100.0 * image_features @ classifier # measure accuracy if logits.shape[-1] < 5: - (acc1, ) = accuracy(logits, target, topk=(1, )) + (acc1,) = accuracy(logits, target, topk=(1,)) acc5 = -1 else: acc1, acc5 = accuracy(logits, target, topk=(1, 5)) @@ -133,14 +124,15 @@ def zero_shot_eval(model, data, args): for k, v in data.items(): if "eval/classification" in k: data_name = os.path.basename(k) - classifier_filename = f"{os.path.dirname(v.classname_filename)}/{args.pretrained_text_model}_{data_name}_classifier.pt" + classifier_filename = ( + f"{os.path.dirname(v.classname_filename)}/{args.pretrained_text_model}_{data_name}_classifier.pt" + ) if os.path.exists(classifier_filename): print("load classifier from disk") classifier = paddle.load(classifier_filename) else: print("constructing classifier.") - classifier = zero_shot_classifier(model, v.classname_filename, - v.template_filename, args) + classifier = zero_shot_classifier(model, v.classname_filename, v.template_filename, args) paddle.save(classifier, classifier_filename) print(f"zero-shot evaluating classification task: {data_name}") if args.bf16: @@ -154,9 +146,7 @@ def zero_shot_eval(model, data, args): # FIXME: DEBUG ONLY results[f"{k}-top1"] = top1 - print( - f"zero-shot classification task: {data_name}: top1: {top1}, top5: {top5}" - ) + print(f"zero-shot classification task: {data_name}: top1: {top1}, top5: {top5}") print("Finished zero-shot evaluation.") diff --git a/paddlemix/models/blip2/Qformer.py b/paddlemix/models/blip2/Qformer.py index 32b86642170f8..979ecf7857407 100644 --- a/paddlemix/models/blip2/Qformer.py +++ b/paddlemix/models/blip2/Qformer.py @@ -13,31 +13,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sklearn -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Dict, Any import inspect -import numpy as np +import math +from typing import Tuple +import numpy as np import paddle -from paddle import Tensor, device, dtype, nn -from paddle.nn import CrossEntropyLoss import paddle.nn.functional as F +from paddle import Tensor, device, nn from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker - +from paddle.distributed.fleet.utils import recompute from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.bert.configuration import BertConfig from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput) - + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, +) from paddlenlp.transformers.model_utils import PretrainedModel -from paddlenlp.transformers.bert.configuration import BertConfig -import numpy as np -import paddle -from paddle.distributed.fleet.utils import recompute class CrossEntropyLoss(nn.Layer): @@ -45,7 +40,7 @@ class CrossEntropyLoss(nn.Layer): Softmax Cross entropy loss """ - def __init__(self, reduction='mean', label_smoothing=None): + def __init__(self, reduction="mean", label_smoothing=None): super().__init__() if label_smoothing is not None: assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" @@ -75,12 +70,12 @@ def forward(self, x, label): loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: - label = paddle.cast(label, 'int64') + label = paddle.cast(label, "int64") loss = F.cross_entropy(x, label=label, soft_label=False) - if self.reduction == 'sum': + if self.reduction == "sum": return loss.sum() - elif self.reduction == 'mean': + elif self.reduction == "mean": return loss.mean() else: return loss @@ -93,40 +88,29 @@ class BertEmbeddings(nn.Layer): def __init__(self, config): super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", - paddle.expand( - paddle.arange(config.max_position_embeddings), - [1, -1])) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", paddle.expand(paddle.arange(config.max_position_embeddings), [1, -1])) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.mp_degree = config.mp_degree def forward( - self, - input_ids=None, - position_ids=None, - query_embeds=None, - past_key_values_length=0, ): + self, + input_ids=None, + position_ids=None, + query_embeds=None, + 
past_key_values_length=0, + ): if input_ids is not None: seq_length = input_ids.shape[1] else: seq_length = 0 if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length: - seq_length + - past_key_values_length] + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] if input_ids is not None: embeddings = self.word_embeddings(input_ids) @@ -151,91 +135,63 @@ class BertSelfAttention(nn.Layer): def __init__(self, config, is_cross_attention): super().__init__() self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size if config.mp_degree > 1: self.query = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.hidden_size, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) else: if config.use_fusedlinear: - self.query = paddle.incubate.nn.FusedLinear(config.hidden_size, - self.all_head_size) + self.query = paddle.incubate.nn.FusedLinear(config.hidden_size, self.all_head_size) else: self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: if config.mp_degree > 1: self.key = fleet.meta_parallel.ColumnParallelLinear( - config.encoder_width, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.encoder_width, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) self.value = fleet.meta_parallel.ColumnParallelLinear( - config.encoder_width, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.encoder_width, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) else: if config.use_fusedlinear: - self.key = paddle.incubate.nn.FusedLinear( - config.encoder_width, self.all_head_size) - self.value = paddle.incubate.nn.FusedLinear( - config.encoder_width, self.all_head_size) + self.key = paddle.incubate.nn.FusedLinear(config.encoder_width, self.all_head_size) + self.value = paddle.incubate.nn.FusedLinear(config.encoder_width, self.all_head_size) else: - self.key = nn.Linear(config.encoder_width, - self.all_head_size) - self.value = nn.Linear(config.encoder_width, - self.all_head_size) + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) else: if config.mp_degree > 1: self.key = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.hidden_size, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) self.value = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.hidden_size, self.all_head_size, 
weight_attr=None, has_bias=True, gather_output=True + ) else: if config.use_fusedlinear: - self.key = paddle.incubate.nn.FusedLinear( - config.hidden_size, self.all_head_size) - self.value = paddle.incubate.nn.FusedLinear( - config.hidden_size, self.all_head_size) + self.key = paddle.incubate.nn.FusedLinear(config.hidden_size, self.all_head_size) + self.value = paddle.incubate.nn.FusedLinear(config.hidden_size, self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, - self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) self.mp_degree = config.mp_degree self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -260,14 +216,15 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be @@ -275,17 +232,14 @@ def forward( is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -296,34 +250,24 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = paddle.matmul(query_layer, - key_layer.transpose([0, 1, 3, 2])) + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": seq_length = hidden_states.size()[1] position_ids_l = paddle.arange(seq_length).reshape([-1, 1]) position_ids_r = paddle.arange(seq_length).reshape([1, -1]) distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.cast( - query_layer.dtype) # fp16 compatibility + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": - relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = ( - attention_scores + relative_position_scores_query + - relative_position_scores_key) - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask @@ -350,15 +294,12 @@ def forward( context_layer = paddle.matmul(attention_probs_dropped, value_layer) context_layer = context_layer.transpose([0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [ - self.all_head_size - ] + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -366,12 +307,10 @@ class BertSelfOutput(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.hidden_size) else: self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.mp_degree = 
config.mp_degree @@ -393,14 +332,15 @@ def __init__(self, config, is_cross_attention=False): self.output = BertSelfOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): self_outputs = self.self( hidden_states, attention_mask, @@ -408,11 +348,11 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -420,8 +360,7 @@ class BertIntermediate(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear( - config.hidden_size, config.intermediate_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.intermediate_size) else: self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -439,12 +378,10 @@ class BertOutput(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear( - config.intermediate_size, config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.intermediate_size, config.hidden_size) else: self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.mp_degree = config.mp_degree @@ -467,10 +404,8 @@ def __init__(self, config, layer_num): self.seq_len_dim = 1 self.attention = BertAttention(config) self.layer_num = layer_num - if (self.config.add_cross_attention and - layer_num % self.config.cross_attention_freq == 0): - self.crossattention = BertAttention( - config, is_cross_attention=self.config.add_cross_attention) + if self.config.add_cross_attention and layer_num % self.config.cross_attention_freq == 0: + self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention) self.has_cross_attention = True else: self.has_cross_attention = False @@ -480,25 +415,27 @@ def __init__(self, config, layer_num): self.intermediate_query = BertIntermediate(config) self.output_query = BertOutput(config) - def forward(self, - hidden_states=None, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, - **kwargs): + def forward( + self, + hidden_states=None, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + **kwargs + ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( 
hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -517,7 +454,8 @@ def forward(self, head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) query_attention_output = cross_attention_outputs[0] outputs = ( outputs + cross_attention_outputs[1:-1] @@ -527,59 +465,59 @@ def forward(self, self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, ) + query_attention_output, + ) if attention_output.shape[1] > query_length: layer_output_text = self.apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], ) - layer_output = paddle.concat( - [layer_output, layer_output_text], axis=1) + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) else: layer_output = self.apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, ) - outputs = (layer_output, ) + outputs + attention_output, + ) + outputs = (layer_output,) + outputs - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs - def apply_chunking_to_forward(self, forward_fn, chunk_size, chunk_dim, - *input_tensors): - assert len( - input_tensors) > 0, "{0} has to be a tuple/list of tensors".format( - input_tensors) + def apply_chunking_to_forward(self, forward_fn, chunk_size, chunk_dim, *input_tensors): + assert len(input_tensors) > 0, "{0} has to be a tuple/list of tensors".format(input_tensors) # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility - num_args_in_forward_chunk_fn = len( - inspect.signature(forward_fn).parameters) + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) if num_args_in_forward_chunk_fn != len(input_tensors): raise ValueError( f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " - "tensors are given") + "tensors are given" + ) if chunk_size > 0: tensor_shape = input_tensors[0].shape[chunk_dim] for input_tensor in input_tensors: if input_tensor.shape[chunk_dim] != tensor_shape: raise ValueError( f"All input tenors have to be of the same shape: {tensor_shape}, " - f"found shape {input_tensor.shape[chunk_dim]}") + f"found shape {input_tensor.shape[chunk_dim]}" + ) if input_tensors[0].shape[chunk_dim] % chunk_size != 0: raise ValueError( f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " - f"size {chunk_size}") + f"size {chunk_size}" + ) num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size input_tensors_chunks = tuple( - input_tensor.chunk( - num_chunks, axis=chunk_dim) - for input_tensor in input_tensors) + input_tensor.chunk(num_chunks, axis=chunk_dim) for input_tensor in input_tensors + ) output_chunks = tuple( - forward_fn(*input_tensors_chunk) - for input_tensors_chunk in zip(*input_tensors_chunks)) + forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks) + ) return paddle.concat(output_chunks, axis=chunk_dim) return forward_fn(*input_tensors) @@ -598,56 +536,63 @@ class BertEncoder(nn.Layer): def __init__(self, 
config): super().__init__() self.config = config - self.layer = nn.LayerList( - [BertLayer(config, i) for i in range(config.num_hidden_layers)]) + self.layer = nn.LayerList([BertLayer(config, i) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = config.gradient_checkpointing def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - all_cross_attentions = (() if output_attentions and - self.config.add_cross_attention else None) + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None next_decoder_cache = () if use_cache else None # cuda_state = paddle.get_cuda_rng_state() # paddle.set_cuda_rng_state(cuda_state) # print("qformergradient_checkpointing:{}".format(self.gradient_checkpointing)) - for i in range(self.config.num_hidden_layers): #add recompute + for i in range(self.config.num_hidden_layers): # add recompute layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, ) + return module( + *inputs, + ) return custom_forward layer_outputs = recompute( create_custom_forward(layer_module), - *(hidden_states, attention_mask, layer_head_mask, - encoder_hidden_states, encoder_attention_mask, - past_key_value, output_attentions, query_length), - **{"preserve_rng_state": True, - "use_reentrant": False}) + *( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ), + **{"preserve_rng_state": True, "use_reentrant": False}, + ) else: layer_outputs = layer_module( @@ -658,34 +603,38 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, ) + query_length, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) - all_cross_attentions = all_cross_attentions + (layer_outputs[2], - ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + 
v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) + cross_attentions=all_cross_attentions, + ) class BertPooler(nn.Layer): @@ -701,8 +650,7 @@ def __init__(self, config: BertConfig): """ super(BertPooler, self).__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.hidden_size) else: self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() @@ -718,8 +666,7 @@ class BertPredictionHeadTransform(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.hidden_size) else: self.dense = nn.Linear(config.hidden_size, config.hidden_size) # self.dense = fleet.meta_parallel.ColumnParallelLinear(config.hidden_size, config.hidden_size,weight_attr=None, @@ -729,8 +676,7 @@ def __init__(self, config): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -747,11 +693,9 @@ def __init__(self, config): # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.vocab_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.vocab_size) else: - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias_attr=False) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) # # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` # self.decoder.bias = self.bias @@ -759,9 +703,8 @@ def __init__(self, config): initializer = paddle.nn.initializer.Constant(value=0.0) bias_data = paddle.zeros([config.vocab_size]) self.bias = self.create_parameter( - shape=[config.vocab_size], - dtype='float32', - default_initializer=initializer(bias_data)) + shape=[config.vocab_size], dtype="float32", default_initializer=initializer(bias_data) + ) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias @@ -780,8 +723,7 @@ def __init__(self, config): def forward(self, sequence_output, word_embeddings): prediction_scores = self.predictions(sequence_output) - prediction_scores = prediction_scores @word_embeddings.weight.t( - ) + self.predictions.bias + prediction_scores = prediction_scores @ word_embeddings.weight.t() + self.predictions.bias return prediction_scores @@ -854,11 +796,9 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: `torch.Tensor`: The inverted attention mask. 
""" if encoder_attention_mask.dim() == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, - None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, - None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 @@ -866,18 +806,18 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: # encoder_extended_attention_mask.transpose(-1, -2)) # encoder_extended_attention_mask = encoder_extended_attention_mask.cast(dtype=encoder_attention_mask.dtype) # fp16 compatibility - encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * np.finfo('float32').min + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * np.finfo("float32").min return encoder_extended_attention_mask def get_extended_attention_mask( - self, - attention_mask: Tensor, - input_shape: Tuple[int], - device: device, - is_decoder: bool, - has_query: bool=False, ) -> Tensor: + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. @@ -904,58 +844,57 @@ def get_extended_attention_mask( batch_size, seq_length = input_shape seq_ids = paddle.arange(seq_length) - causal_mask = (paddle.tile(seq_ids[None, None, :], - [batch_size, seq_length, 1]) <= - seq_ids[None, :, None]) + causal_mask = ( + paddle.tile(seq_ids[None, None, :], [batch_size, seq_length, 1]) <= seq_ids[None, :, None] + ) # add a prefix ones mask to the causal mask # causal and attention masks must have same type with pytorch version < 1.3 causal_mask = causal_mask.cast(attention_mask.dtype) if causal_mask.shape[1] < attention_mask.shape[1]: - prefix_seq_len = attention_mask.shape[ - 1] - causal_mask.shape[1] + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] if has_query: # UniLM style attention mask causal_mask = paddle.concat( [ paddle.zeros( (batch_size, prefix_seq_len, seq_length), - dtype=causal_mask.dtype, ), + dtype=causal_mask.dtype, + ), causal_mask, ], - axis=1, ) + axis=1, + ) causal_mask = paddle.concat( [ paddle.ones( - (batch_size, causal_mask.shape[1], - prefix_seq_len), - dtype=causal_mask.dtype, ), + (batch_size, causal_mask.shape[1], prefix_seq_len), + dtype=causal_mask.dtype, + ), causal_mask, ], - axis=-1, ) - extended_attention_mask = (causal_mask[:, None, :, :] * - attention_mask[:, None, None, :]) + axis=-1, + ) + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. 
# Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.cast( - self.config.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.cast(self.config.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - def get_head_mask(self, - head_mask, - num_hidden_layers, - is_attention_chunked=False) -> Tensor: + def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False) -> Tensor: """ Prepare the head mask if needed. @@ -972,8 +911,7 @@ def get_head_mask(self, `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, - num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -984,32 +922,30 @@ def get_head_mask(self, def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( - -1) # We can specify head_mask for each layer - assert head_mask.dim( - ) == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" # head_mask = head_mask.to(dtype=num_hidden_layers.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - query_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - is_decoder=False, ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1028,24 +964,21 @@ def forward( If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # use_cache = use_cache if use_cache is not None else self.config.use_cache if input_ids is None: - assert (query_embeds is not None - ), "You have to specify query_embeds when input_ids is None" + assert query_embeds is not None, "You have to specify query_embeds when input_ids is None" # past_key_values_length past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None else 0) + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) query_length = query_embeds.shape[1] if query_embeds is not None else 0 @@ -1053,14 +986,14 @@ def forward( input_ids=input_ids, position_ids=position_ids, query_embeds=query_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) input_shape = embedding_output.shape[:-1] batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
@@ -1070,34 +1003,27 @@ def forward( input_ids.shape, device, is_decoder, - has_query=(query_embeds is not None), ) + has_query=(query_embeds is not None), + ) else: - extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape, device, is_decoder) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0].shape + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape else: - [encoder_batch_size, encoder_sequence_length, - _] = encoder_hidden_states.shape + [encoder_batch_size, encoder_sequence_length, _] = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: - encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) - for mask in encoder_attention_mask - ] + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1119,10 +1045,10 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, ) + query_length=query_length, + ) sequence_output = encoder_outputs[0] - pooled_output = (self.pooler(sequence_output) - if self.pooler is not None else None) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -1133,23 +1059,18 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class BertLMHeadModel(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [ - r"position_ids", r"predictions.decoder.bias" - ] - - def __init__(self, - config, - encoder_width=None, - train_in_satge1=False, - **kwargs): + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config, encoder_width=None, train_in_satge1=False, **kwargs): super().__init__(config) - from paddle.distributed import fleet + config.mp_degree = kwargs.get("mp_degree") config.encoder_width = encoder_width config.gradient_checkpointing = False @@ -1161,21 +1082,17 @@ def __init__(self, self.query_tokens = paddle.create_parameter( shape=(1, config.num_query_tokens, config.hidden_size), - dtype='float32', - default_initializer=paddle.nn.initializer.Normal( - mean=0.0, std=config.initializer_range)) + dtype="float32", + default_initializer=paddle.nn.initializer.Normal(mean=0.0, 
std=config.initializer_range), + ) if train_in_satge1: - self.vision_proj = paddle.nn.Linear( - in_features=config.hidden_size, out_features=config.embed_dim) - self.text_proj = paddle.nn.Linear( - in_features=config.hidden_size, out_features=config.embed_dim) - self.itm_head = paddle.nn.Linear( - in_features=config.hidden_size, out_features=2) - self.resize_token_embeddings(kwargs.get('tokenizer_length')) + self.vision_proj = paddle.nn.Linear(in_features=config.hidden_size, out_features=config.embed_dim) + self.text_proj = paddle.nn.Linear(in_features=config.hidden_size, out_features=config.embed_dim) + self.itm_head = paddle.nn.Linear(in_features=config.hidden_size, out_features=2) + self.resize_token_embeddings(kwargs.get("tokenizer_length")) else: - text_hidden_size = kwargs.get('text_hidden_size') - self.language_projection = paddle.nn.Linear( - in_features=config.hidden_size, out_features=text_hidden_size) + text_hidden_size = kwargs.get("text_hidden_size") + self.language_projection = paddle.nn.Linear(in_features=config.hidden_size, out_features=text_hidden_size) # self.init_weights() @@ -1186,23 +1103,24 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - query_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=True, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - return_logits=False, - is_decoder=True, - reduction="mean", ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1235,8 +1153,7 @@ def forward( >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False if past_key_values is not None: @@ -1255,14 +1172,14 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - is_decoder=is_decoder, ) + is_decoder=is_decoder, + ) sequence_output = outputs[0] if query_embeds is not None: - sequence_output = outputs[0][:, query_embeds.shape[1]:, :] + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] - prediction_scores = self.cls(sequence_output, - self.bert.embeddings.word_embeddings) + prediction_scores = self.cls(sequence_output, self.bert.embeddings.word_embeddings) if return_logits: return prediction_scores[:, :-1, :] @@ -1274,23 +1191,20 @@ def forward( labels = labels[:, 1:] labels = labels.flatten() # loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) - loss_fct = CrossEntropyLoss( - reduction=reduction, label_smoothing=0.1) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) valid_index = paddle.where(labels != -100)[0].flatten() - logits = shifted_prediction_scores.reshape( - (-1, self.config.vocab_size)) + logits = shifted_prediction_scores.reshape((-1, self.config.vocab_size)) logits = paddle.gather(logits, valid_index, axis=0) labels = paddle.gather(labels, valid_index, axis=0) lm_loss = loss_fct(logits, labels) if reduction == "none": - lm_loss = lm_loss.reshape( - [prediction_scores.shape(0), -1]).sum(1) + lm_loss = lm_loss.reshape([prediction_scores.shape(0), -1]).sum(1) if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, @@ -1298,14 +1212,10 @@ def forward( past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, ) - - def prepare_inputs_for_generation(self, - input_ids, - query_embeds, - past=None, - attention_mask=None, - **model_kwargs): + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly # if attention_mask is None: # attention_mask = input_ids.new_ones(input_ids.shape) @@ -1325,28 +1235,22 @@ def prepare_inputs_for_generation(self, "query_embeds": query_embeds, "attention_mask": attention_mask, "past_key_values": past, - "encoder_hidden_states": - model_kwargs.get("encoder_hidden_states", None), - "encoder_attention_mask": - model_kwargs.get("encoder_attention_mask", None), + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), "is_decoder": True, } def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in 
layer_past),) return reordered_past class BertForMaskedLM(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [ - r"position_ids", r"predictions.decoder.bias" - ] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] def __init__(self, config): super().__init__(config) @@ -1363,20 +1267,21 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - query_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - return_logits=False, - is_decoder=False, ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., @@ -1384,8 +1289,7 @@ def forward( (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1398,10 +1302,11 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - is_decoder=is_decoder, ) + is_decoder=is_decoder, + ) if query_embeds is not None: - sequence_output = outputs[0][:, query_embeds.shape[1]:, :] + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] prediction_scores = self.cls(sequence_output) if return_logits: @@ -1410,26 +1315,24 @@ def forward( masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.reshape([-1, self.config.vocab_size]), - labels.reshape([-1])) + masked_lm_loss = loss_fct(prediction_scores.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return (((masked_lm_loss, ) + output) - if masked_lm_loss is not None else output) + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, - attentions=outputs.attentions, ) + attentions=outputs.attentions, + ) def prune_linear_layer(layer, index_to_prune, dim=0): index = paddle.to_tensor(index_to_prune) num_dims = len(layer.weight.shape) - index_expanded = index.expand((layer.weight.shape[dim], )).T + index_expanded = index.expand((layer.weight.shape[dim],)).T if dim != 0: perm = list(range(num_dims)) @@ -1447,8 +1350,7 @@ def prune_linear_layer(layer, index_to_prune, dim=0): return layer -def find_pruneable_heads_and_indices(heads, n_heads, head_size, - already_pruned_heads): +def find_pruneable_heads_and_indices(heads, n_heads, head_size, already_pruned_heads): """ Finds the heads and their indices taking `already_pruned_heads` into account. 
@@ -1462,14 +1364,12 @@ def find_pruneable_heads_and_indices(heads, n_heads, head_size, `Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. """ mask = paddle.ones([n_heads, head_size]) - heads = set( - heads - ) - already_pruned_heads # Convert to set and remove already pruned heads + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in already_pruned_heads) mask[head] = 0 mask = mask.reshape(-1).eq(1) # index: torch.LongTensor = torch.arange(len(mask))[mask].long() - index = paddle.arange(len(mask))[mask].astype('int64') + index = paddle.arange(len(mask))[mask].astype("int64") return heads, index diff --git a/paddlemix/models/blip2/configuration.py b/paddlemix/models/blip2/configuration.py index a86d5d1908372..c978eeb0419bf 100644 --- a/paddlemix/models/blip2/configuration.py +++ b/paddlemix/models/blip2/configuration.py @@ -17,12 +17,7 @@ import os from typing import Union -from paddlenlp.transformers import AutoConfig -from paddlenlp.transformers.auto.modeling import \ - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from paddlenlp.transformers.configuration_utils import PretrainedConfig -from paddlenlp.transformers.opt.configuration import OPTConfig -from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.utils.log import logger __all__ = [ @@ -82,18 +77,19 @@ class Blip2VisionConfig(PretrainedConfig): model_type = "blip_2_vision_model" def __init__( - self, - img_size=224, - patch_size=14, - embed_dim=1408, - depth=39, - num_heads=16, - mlp_ratio=4.3637, - qkv_bias=True, - drop_rate=0, - epsilon=1e-6, - gradient_checkpointing=False, - **kwargs, ): + self, + img_size=224, + patch_size=14, + embed_dim=1408, + depth=39, + num_heads=16, + mlp_ratio=4.3637, + qkv_bias=True, + drop_rate=0, + epsilon=1e-6, + gradient_checkpointing=False, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -108,26 +104,22 @@ def __init__( self.epsilon = epsilon self.gradient_checkpointing = gradient_checkpointing - self.in_chans = kwargs.get('in_chans', 3) - self.class_num = kwargs.get('class_num', 1000) - self.qk_scale = kwargs.get('qk_scale', None) - self.attn_drop_rate = kwargs.get('attn_drop_rate=', 0.) - self.drop_path_rate = kwargs.get('drop_path_rate', 0.) 
- self.norm_layer = kwargs.get('norm_layer', 'nn.LayerNorm') + self.in_chans = kwargs.get("in_chans", 3) + self.class_num = kwargs.get("class_num", 1000) + self.qk_scale = kwargs.get("qk_scale", None) + self.attn_drop_rate = kwargs.get("attn_drop_rate=", 0.0) + self.drop_path_rate = kwargs.get("drop_path_rate", 0.0) + self.norm_layer = kwargs.get("norm_layer", "nn.LayerNorm") @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from Blip2Config if config_dict.get("model_type") == "blip-2": config_dict = config_dict["vision_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -196,24 +188,25 @@ class Blip2QFormerConfig(PretrainedConfig): model_type = "blip_2_qformer" def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, ): + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -234,18 +227,14 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "blip-2": config_dict = config_dict["qformer_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." @@ -300,31 +289,26 @@ class Blip2Config(PretrainedConfig): is_composition = True def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - **kwargs, ): + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): super().__init__(**kwargs) if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the Blip2VisionConfig with default values." - ) + logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") if qformer_config is None: qformer_config = {} - logger.info( - "qformer_config is None. Initializing the Blip2QFormerConfig with default values." - ) + logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the text config with default values (`OPTConfig`)." - ) + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") self.vision_config = vision_config self.qformer_config = qformer_config self.text_config = text_config @@ -336,15 +320,16 @@ def __init__( # self.use_decoder_only_language_model = self.text_config.model_type in CONFIGURATION_MODEL_MAPPING self.initializer_factor = 1.0 self.initializer_range = 0.02 - self.freeze_vit = kwargs.get('freeze_vit', True) + self.freeze_vit = kwargs.get("freeze_vit", True) @classmethod def from_vision_qformer_text_configs( - cls, - vision_config: Blip2VisionConfig, - qformer_config: Blip2QFormerConfig, - text_config: PretrainedConfig, - **kwargs, ): + cls, + vision_config: Blip2VisionConfig, + qformer_config: Blip2QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): r""" Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model configurations. @@ -356,7 +341,8 @@ def from_vision_qformer_text_configs( vision_config=vision_config, qformer_config=qformer_config, text_config=text_config, - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/blip2/eva_vit.py b/paddlemix/models/blip2/eva_vit.py index bdbfa337377fe..d7ef6525c8a55 100644 --- a/paddlemix/models/blip2/eva_vit.py +++ b/paddlemix/models/blip2/eva_vit.py @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
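Stepping back to the configuration classes reworked above, a hedged usage sketch of composing a Blip2Config from its sub-configs; the choice of OPTConfig for the text model and the use of all-default constructors are assumptions for illustration, not part of the patch.

from paddlenlp.transformers.opt.configuration import OPTConfig

from paddlemix.models.blip2.configuration import (
    Blip2Config,
    Blip2QFormerConfig,
    Blip2VisionConfig,
)

vision_config = Blip2VisionConfig()    # EVA-ViT style defaults from this file (img_size=224, depth=39, ...)
qformer_config = Blip2QFormerConfig()  # BERT-style defaults from this file
config = Blip2Config.from_vision_qformer_text_configs(
    vision_config=vision_config,
    qformer_config=qformer_config,
    text_config=OPTConfig(),           # assumed: any language-model PretrainedConfig works here
    num_query_tokens=32,
)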
-# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# reference: https://arxiv.org/abs/2010.11929 -from paddlemix.utils.log import logger from collections.abc import Callable -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker + import numpy as np import paddle import paddle.nn as nn from paddle import _legacy_C_ops -from paddle.nn.initializer import TruncatedNormal, Constant, Normal from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.nn.functional.flash_attention import flash_attention +from paddle.nn.initializer import Constant, Normal, TruncatedNormal + from paddlemix.models.blip2.configuration import Blip2VisionConfig from paddlemix.models.blip2.modeling import Blip2PretrainedModel -from paddle.nn.functional.flash_attention import (flash_attention, ) -trunc_normal_ = TruncatedNormal(std=.02) +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# reference: https://arxiv.org/abs/2010.11929 +from paddlemix.utils.log import logger + +trunc_normal_ = TruncatedNormal(std=0.02) normal_ = Normal -zeros_ = Constant(value=0.) -ones_ = Constant(value=1.) +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) from paddle.distributed.fleet.utils import recompute @@ -39,12 +41,12 @@ def to_2tuple(x): return tuple([x] * 2) -def drop_path(x, drop_prob=0., training=False): +def drop_path(x, drop_prob=0.0, training=False): - if drop_prob == 0. or not training: + if drop_prob == 0.0 or not training: return x keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + shape = (paddle.shape(x)[0],) + (1,) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor @@ -61,37 +63,31 @@ def forward(self, x): class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0., - mp_degree=1, - use_fusedlinear=False): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + mp_degree=1, + use_fusedlinear=False, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features if mp_degree > 1: self.fc1 = fleet.meta_parallel.ColumnParallelLinear( - in_features, - hidden_features, - weight_attr=None, - has_bias=True, - gather_output=True) + in_features, hidden_features, weight_attr=None, has_bias=True, gather_output=True + ) self.fc2 = fleet.meta_parallel.ColumnParallelLinear( - hidden_features, - out_features, - weight_attr=None, - has_bias=True, - gather_output=True) + hidden_features, out_features, weight_attr=None, has_bias=True, gather_output=True + ) else: if use_fusedlinear: self.use_fusedlinear = True - self.fc1 = paddle.incubate.nn.FusedLinear(in_features, - hidden_features) - self.fc2 = paddle.incubate.nn.FusedLinear(hidden_features, - out_features) + self.fc1 = paddle.incubate.nn.FusedLinear(in_features, hidden_features) + self.fc2 = paddle.incubate.nn.FusedLinear(hidden_features, out_features) self.fc1 = nn.Linear(in_features, hidden_features) self.fc2 = nn.Linear(hidden_features, out_features) self.mp_degree = mp_degree @@ -102,12 +98,12 @@ def forward(self, x): if getattr(self, 
"use_fusedlinear", False): if isinstance(self.act, nn.GELU): x = _legacy_C_ops.fused_gemm_epilogue( - x, self.fc1.weight, self.fc1.bias, 'trans_x', False, - 'trans_y', False, 'activation', 'gelu') + x, self.fc1.weight, self.fc1.bias, "trans_x", False, "trans_y", False, "activation", "gelu" + ) elif isinstance(self.act, nn.ReLU): x = _legacy_C_ops.fused_gemm_epilogue( - x, self.fc1.weight, self.fc1.bias, 'trans_x', False, - 'trans_y', False, 'activation', 'relu') + x, self.fc1.weight, self.fc1.bias, "trans_x", False, "trans_y", False, "activation", "relu" + ) else: ValueError else: @@ -123,17 +119,19 @@ def forward(self, x): class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0., - window_size=None, - mp_degree=1, - use_fusedlinear=False, - use_flash_attn=False): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + mp_degree=1, + use_fusedlinear=False, + use_flash_attn=False, + ): super().__init__() self.use_flash_attn = use_flash_attn self.num_heads = num_heads @@ -141,21 +139,18 @@ def __init__(self, self.scale = qk_scale or head_dim**-0.5 if mp_degree > 1: self.qkv = fleet.meta_parallel.ColumnParallelLinear( - dim, - dim * 3, - weight_attr=None, - has_bias=True, - gather_output=True) + dim, dim * 3, weight_attr=None, has_bias=True, gather_output=True + ) else: if use_fusedlinear: - self.qkv = paddle.incubate.nn.FusedLinear( - dim, dim * 3, bias_attr=qkv_bias) + self.qkv = paddle.incubate.nn.FusedLinear(dim, dim * 3, bias_attr=qkv_bias) else: self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) if mp_degree > 1: self.proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) + dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: if use_fusedlinear: self.proj = paddle.incubate.nn.FusedLinear(dim, dim) @@ -165,31 +160,25 @@ def __init__(self, self.proj_drop = nn.Dropout(proj_drop) def _register_relative_position_index( - self, - window_size, - num_heads, ): - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 + self, + window_size, + num_heads, + ): + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + [self.num_relative_distance, num_heads], default_initializer=zeros_ + ) # 2*Wh-1 * 2*Ww-1, nH coords_h = paddle.arange(window_size[0]) coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, - None] - coords_flatten[:, - None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros((window_size[0] * window_size[1] + 1, ) * 2, 
dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww + relative_position_index = paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 @@ -198,28 +187,20 @@ def _register_relative_position_index( def forward(self, x, rel_pos_bias=None): N, C = x.shape[1:] - qkv = self.qkv(x).reshape( - (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( - (2, 0, 3, 1, 4)) + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] if self.use_flash_attn: - x, _ = flash_attention( - q, - k, - v, - dropout=self.proj_drop.p, - causal=False, - return_softmax=False) + x, _ = flash_attention(q, k, v, dropout=self.proj_drop.p, causal=False, return_softmax=False) x = paddle.reshape(x, [0, 0, -1]) else: attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - if hasattr(self, 'relative_position_bias_table'): - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + if hasattr(self, "relative_position_bias_table"): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1]) + ].reshape( + [self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1] + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) attn = nn.functional.softmax(attn, axis=-1) @@ -242,31 +223,32 @@ def forward(self, x, rel_pos_bias=None): class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - init_values=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer='nn.LayerNorm', - epsilon=1e-5, - window_size=None, - mp_degree=1, - use_flash_attn=False, - use_fusedlinear=False): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + init_values=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer="nn.LayerNorm", + epsilon=1e-5, + window_size=None, + mp_degree=1, + use_flash_attn=False, + use_fusedlinear=False, + ): super().__init__() if isinstance(norm_layer, str): self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm1 = norm_layer(dim) else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") + raise TypeError("The norm_layer must be str or paddle.nn.layer.Layer class") self.attn = Attention( dim, num_heads=num_heads, @@ -277,7 +259,8 @@ def __init__(self, window_size=window_size, mp_degree=mp_degree, use_flash_attn=use_flash_attn, - use_fusedlinear=use_fusedlinear) + use_fusedlinear=use_fusedlinear, + ) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) self.gamma_1 = None @@ -287,25 +270,23 @@ def __init__(self, elif isinstance(norm_layer, Callable): 
self.norm2 = norm_layer(dim) else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") + raise TypeError("The norm_layer must be str or paddle.nn.layer.Layer class") mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - mp_degree=mp_degree, - use_fusedlinear=use_fusedlinear) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + mp_degree=mp_degree, + use_fusedlinear=use_fusedlinear, + ) def forward(self, x, rel_pos_bias=None): if self.gamma_1 is not None: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) else: - x = x + self.drop_path( - self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.mlp(self.norm2(x))) return x @@ -314,31 +295,24 @@ class RelativePositionBias(nn.Layer): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + [self.num_relative_distance, num_heads], default_initializer=zeros_ + ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(window_size[0]) coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, - None] - coords_flatten[:, - None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww + relative_position_index = paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 @@ -348,42 +322,39 @@ def __init__(self, window_size, num_heads): # trunc_normal_(self.relative_position_bias_table, std=.02) def forward(self): - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1]) 
# Wh*Ww,Wh*Ww,nH + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape( + [self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1] + ) # Wh*Ww,Wh*Ww,nH return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ + """Image to Patch Embedding""" def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * \ - (img_size[0] // patch_size[0]) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, C, H, W = x.shape - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." x = self.proj(x).flatten(2).transpose((0, 2, 1)) return x class VisionTransformer(Blip2PretrainedModel): - """ Vision Transformer with support for patch input - """ + """Vision Transformer with support for patch input""" + main_input_name = "pixel_values" config_class = Blip2VisionConfig @@ -396,47 +367,49 @@ def __init__(self, config: Blip2VisionConfig, **kwargs): self.num_features = self.embed_dim = config.embed_dim _img_size = to_2tuple(config.img_size) _patch_size = to_2tuple(config.patch_size) - self.window_size = (_img_size[0] // _patch_size[0], - _img_size[1] // _patch_size[1]) + self.window_size = (_img_size[0] // _patch_size[0], _img_size[1] // _patch_size[1]) self.patch_embed = PatchEmbed( img_size=config.img_size, patch_size=config.patch_size, in_chans=config.in_chans, - embed_dim=config.embed_dim) + embed_dim=config.embed_dim, + ) num_patches = self.patch_embed.num_patches - self.cls_token = self.create_parameter( - shape=(1, 1, config.embed_dim), default_initializer=zeros_) + self.cls_token = self.create_parameter(shape=(1, 1, config.embed_dim), default_initializer=zeros_) self.pos_embed = self.create_parameter( - shape=(1, num_patches + 1, config.embed_dim), - default_initializer=zeros_) + shape=(1, num_patches + 1, config.embed_dim), default_initializer=zeros_ + ) self.add_parameter("pos_embed", self.pos_embed) self.add_parameter("cls_token", self.cls_token) self.pos_drop = nn.Dropout(p=config.drop_rate) self.gradient_checkpointing = config.gradient_checkpointing - logger.info("self.gradient_checkpointing:{}".format( - self.gradient_checkpointing)) + logger.info("self.gradient_checkpointing:{}".format(self.gradient_checkpointing)) dpr = np.linspace(0, config.drop_path_rate, config.depth) - self.blocks = nn.LayerList([ - Block( - dim=config.embed_dim, - num_heads=config.num_heads, - mlp_ratio=config.mlp_ratio, - qkv_bias=config.qkv_bias, - qk_scale=config.qk_scale, - drop=config.drop_rate, - attn_drop=config.attn_drop_rate, - drop_path=dpr[i], - norm_layer=config.norm_layer, - epsilon=config.epsilon, - window_size=self.window_size, - mp_degree=mp_degree, - use_flash_attn=use_flash_attn, - use_fusedlinear=use_fusedlinear) for i in 
range(config.depth) - ]) + self.blocks = nn.LayerList( + [ + Block( + dim=config.embed_dim, + num_heads=config.num_heads, + mlp_ratio=config.mlp_ratio, + qkv_bias=config.qkv_bias, + qk_scale=config.qk_scale, + drop=config.drop_rate, + attn_drop=config.attn_drop_rate, + drop_path=dpr[i], + norm_layer=config.norm_layer, + epsilon=config.epsilon, + window_size=self.window_size, + mp_degree=mp_degree, + use_flash_attn=use_flash_attn, + use_fusedlinear=use_fusedlinear, + ) + for i in range(config.depth) + ] + ) self.mp_degree = mp_degree if self.pos_embed is not None: @@ -447,9 +420,7 @@ def __init__(self, config: Blip2VisionConfig, **kwargs): def _init_weights(self, m): if isinstance(m, (nn.Linear, fleet.meta_parallel.ColumnParallelLinear)): trunc_normal_(m.weight) - if isinstance(m, - (nn.Linear, fleet.meta_parallel.ColumnParallelLinear - )) and m.bias is not None: + if isinstance(m, (nn.Linear, fleet.meta_parallel.ColumnParallelLinear)) and m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.LayerNorm): zeros_(m.bias) @@ -469,15 +440,14 @@ def forward_features(self, x): x = self.pos_drop(x) else: x = self.pos_drop(x) - rel_pos_bias = self.rel_pos_bias() if hasattr(self, - 'rel_pos_bias') else None + rel_pos_bias = self.rel_pos_bias() if hasattr(self, "rel_pos_bias") else None for blk in self.blocks: if self.gradient_checkpointing and self.training: x = recompute(blk, x, rel_pos_bias=rel_pos_bias) else: x = blk(x, rel_pos_bias=rel_pos_bias) - #x = self.norm(x) + # x = self.norm(x) return x def forward(self, x): @@ -486,60 +456,47 @@ def forward(self, x): def interpolate_pos_embed(model, checkpoint_model): - if 'visual_encoder.pos_embed' in checkpoint_model: - pos_embed_checkpoint = checkpoint_model['visual_encoder.pos_embed'] + if "visual_encoder.pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["visual_encoder.pos_embed"] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.visual_encoder.patch_embed.num_patches - num_extra_tokens = model.visual_encoder.pos_embed.shape[ - -2] - num_patches + num_extra_tokens = model.visual_encoder.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) + pos_tokens = pos_tokens.reshape((-1, orig_size, orig_size, embedding_size)).transpose((0, 3, 1, 2)) pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode='bicubic', - align_corners=False) + pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False + ) pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model['visual_encoder.pos_embed'] = new_pos_embed - elif 'pos_embed' in checkpoint_model: - pos_embed_checkpoint = 
checkpoint_model['pos_embed'] + checkpoint_model["visual_encoder.pos_embed"] = new_pos_embed + elif "pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["pos_embed"] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.patch_embed.num_patches num_extra_tokens = model.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) + pos_tokens = pos_tokens.reshape((-1, orig_size, orig_size, embedding_size)).transpose((0, 3, 1, 2)) pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode='bicubic', - align_corners=False) + pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False + ) pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model['pos_embed'] = new_pos_embed + checkpoint_model["pos_embed"] = new_pos_embed diff --git a/paddlemix/models/blip2/modeling.py b/paddlemix/models/blip2/modeling.py index caeb250df69da..0cfb3c7daa98c 100644 --- a/paddlemix/models/blip2/modeling.py +++ b/paddlemix/models/blip2/modeling.py @@ -13,43 +13,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Paddle BLIP2 model.""" -from paddlemix.utils.log import logger -import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.distributed.fleet.utils import recompute import paddle.distributed as dist - +import paddle.nn as nn +from paddlenlp.transformers import AutoTokenizer from paddlenlp.transformers.model_outputs import ModelOutput from paddlenlp.transformers.model_utils import PretrainedModel - -from paddlemix.models.blip2.modeling_opt import OPTForCausalLM -from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.transformers.t5.modeling import T5ForConditionalGeneration from paddlenlp.utils.initializer import normal_, ones_, zeros_ -from paddlenlp.utils.log import logger -from .configuration import Blip2Config + +from paddlemix.models.blip2.modeling_opt import OPTForCausalLM +from paddlemix.models.blip2.modeling_utils import ( + all_gather_with_grad, + concat_all_gather, + disabled_train, + masked_fill, +) from paddlemix.models.blip2.Qformer import BertLMHeadModel -from paddlenlp.transformers import AutoTokenizer -from paddlemix.models.blip2.modeling_utils import disabled_train, all_gather_with_grad, concat_all_gather, masked_fill +from paddlemix.utils.log import logger + +from .configuration import Blip2Config BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ "Salesforce/blip2-flan-t5-xl", "Salesforce/blip2-opt-2.7b", ] -__all__ = ["Blip2ForConditionalGeneration", ] +__all__ = [ + "Blip2ForConditionalGeneration", +] def Parameter(tensor): return paddle.create_parameter( tensor.shape, dtype=tensor.dtype, - default_initializer=nn.initializer.Assign(tensor), ) + default_initializer=nn.initializer.Assign(tensor), + ) @dataclass @@ -77,9 +80,11 @@ class Blip2ForConditionalGenerationModelOutput(ModelOutput): def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in - ["vision_outputs", "qformer_outputs", "language_model_outputs"] else - getattr(self, k).to_tuple() for k in self.keys()) + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) @dataclass @@ -87,6 +92,7 @@ class Blip2ForStage1ModelOutput(Blip2ForConditionalGenerationModelOutput): """ Class defining the outputs of [`Blip2ForStage1ModelOutput`]. """ + loss: Optional[Tuple[paddle.Tensor]] = None loss_itc: Optional[Tuple[paddle.Tensor]] = None loss_itm: Optional[paddle.Tensor] = None @@ -113,8 +119,7 @@ class Blip2PretrainedModel(PretrainedModel): def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or - isinstance(module, nn.Linear)): + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -132,12 +137,9 @@ def init_tokenizer(cls, tokenizer_name="bert-base-uncased"): return tokenizer @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path, - from_hf_hub: bool=False, - subfolder: str=None, - *args, - **kwargs): + def from_pretrained( + cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str = None, *args, **kwargs + ): """ Creates an instance of `PretrainedModel`. 
Model weights are loaded by specifying name of a built-in pretrained model, a pretrained model from HF Hub, a community contributed model, @@ -192,18 +194,22 @@ def from_pretrained(cls, model = BertForSequenceClassification.from_pretrained('./my_bert/' """ import os + + from paddlenlp.transformers.configuration_utils import PretrainedConfig + from paddlenlp.transformers.model_utils import load_state_dict, no_init_weights from paddlenlp.transformers.utils import ( ContextManagers, + device_guard, is_paddle_support_lazy_init, is_safetensors_available, resolve_cache_dir, - device_guard, ) - from paddlenlp.transformers.configuration_utils import PretrainedConfig + ) from paddlenlp.utils.env import ( CONFIG_NAME, PADDLE_WEIGHTS_NAME, - PYTORCH_WEIGHTS_NAME, ) - from paddlenlp.transformers.model_utils import no_init_weights, load_state_dict + PYTORCH_WEIGHTS_NAME, + ) + config = kwargs.pop("config", None) state_dict = kwargs.pop("state_dict", None) cache_dir = kwargs.pop("cache_dir", None) @@ -212,16 +218,14 @@ def from_pretrained(cls, dtype = kwargs.pop("dtype", None) subfolder = kwargs.pop("subfolder", "") variant = kwargs.pop("variant", None) - use_safetensors = kwargs.pop("use_safetensors", None - if is_safetensors_available() else False) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) convert_from_torch = kwargs.pop("convert_from_torch", None) load_state_as_np = kwargs.pop("load_state_as_np", None) mp_degree = kwargs.pop("mp_degree", 1) if load_state_as_np is not None: - logger.warning( - "`load_state_as_np` is deprecated, please delete it!") + logger.warning("`load_state_as_np` is deprecated, please delete it!") model_kwargs = kwargs @@ -236,8 +240,7 @@ def from_pretrained(cls, if convert_from_torch is None: convert_from_torch = False - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, - from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) # 1. get the PretrainedConfig to init model if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path @@ -248,14 +251,14 @@ def from_pretrained(cls, force_download=force_download, from_hf_hub=from_hf_hub, subfolder=subfolder, - **kwargs, ) + **kwargs, + ) if not os.path.exists(os.path.join(cache_dir, CONFIG_NAME)): config.save_pretrained(cache_dir) # refine options for config config.mp_degree = mp_degree - convert_from_torch = cls.support_conversion( - config) and convert_from_torch + convert_from_torch = cls.support_conversion(config) and convert_from_torch if dtype is None: dtype = config.dtype @@ -285,7 +288,8 @@ def from_pretrained(cls, config=config, convert_from_torch=convert_from_torch, use_safetensors=use_safetensors, - variant=variant, ) + variant=variant, + ) # load pt weights early so that we know which dtype to init the model under if not is_sharded and state_dict is None: @@ -297,8 +301,7 @@ def from_pretrained(cls, f"Starting to convert pytorch weight file<{resolved_archive_file}> to " f"paddle weight file<{os.path.join(cache_dir, PADDLE_WEIGHTS_NAME)}> ..." ) - state_dict = cls.convert(resolved_archive_file, config, - cache_dir) + state_dict = cls.convert(resolved_archive_file, config, cache_dir) else: raise ValueError( f"download the {PYTORCH_WEIGHTS_NAME} weight file, but model<{cls}> " @@ -306,19 +309,15 @@ def from_pretrained(cls, ) else: # 4. 
loading non-sharded ckpt from the state dict - if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith( - "model_state.pdparams"): - state_dict = cls.convert_tensor_parallel( - resolved_archive_file, config) + if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model_state.pdparams"): + state_dict = cls.convert_tensor_parallel(resolved_archive_file, config) else: state_dict = load_state_dict(resolved_archive_file) - logger.info( - "Loaded weights file from disk, setting weights to model.") + logger.info("Loaded weights file from disk, setting weights to model.") # Check if `_keep_in_fp32_modules` is not None - use_keep_in_fp32_modules = ( - cls._keep_in_fp32_modules is not None) and dtype == "float16" + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and dtype == "float16" if is_sharded: loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] @@ -333,14 +332,14 @@ def from_pretrained(cls, for k in list(state_dict.keys()): if not isinstance(state_dict[k], paddle.Tensor): with device_guard(): - state_dict[k] = paddle.Tensor( - state_dict.pop(k), zero_copy=True) + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) # 3. init the model init_args = config["init_args"] or () with ContextManagers(init_contexts): model = cls(config, *init_args, **model_kwargs) from paddlemix.models.blip2.eva_vit import interpolate_pos_embed + interpolate_pos_embed(model, state_dict) if use_keep_in_fp32_modules: # low_cpu_mem_usage = True @@ -358,7 +357,8 @@ def from_pretrained(cls, ignore_mismatched_sizes=ignore_mismatched_sizes, low_cpu_mem_usage=low_cpu_mem_usage, dtype=dtype, - keep_in_fp32_modules=keep_in_fp32_modules, ) + keep_in_fp32_modules=keep_in_fp32_modules, + ) if paddle.in_dynamic_mode(): return model @@ -376,13 +376,15 @@ class Blip2ForConditionalGeneration(Blip2PretrainedModel): ] def __init__( - self, - config: Blip2Config, ): + self, + config: Blip2Config, + ): super().__init__(config) from paddlemix.models.blip2.eva_vit import VisionTransformer + self.visual_encoder = VisionTransformer.from_pretrained( - pretrained_model_name_or_path=config.vision_config, - mp_degree=config.mp_degree) + pretrained_model_name_or_path=config.vision_config, mp_degree=config.mp_degree + ) self.freeze_vit = config.freeze_vit self.train_stage1 = False if self.freeze_vit: @@ -400,33 +402,32 @@ def __init__( encoder_width=self.visual_encoder.num_features, train_in_satge1=True, tokenizer_length=len(self.tokenizer), - mp_degree=config.mp_degree) + mp_degree=config.mp_degree, + ) state_dict = self.Qformer.state_dict() for name, param in self.Qformer.named_parameters(): - if '_query' in name: - key_orig = name.replace('_query', '') - param.copy_(state_dict[key_orig], False) ### problem + if "_query" in name: + key_orig = name.replace("_query", "") + param.copy_(state_dict[key_orig], False) self.temp = self.create_parameter( - shape=(1, ), - default_initializer=paddle.nn.initializer.Constant(value=0.07)) + shape=(1,), default_initializer=paddle.nn.initializer.Constant(value=0.07) + ) self.max_txt_len = config.get("max_txt_len") else: if config.use_decoder_only_language_model: if "opt" in config.text_config: language_model = OPTForCausalLM.from_pretrained( - config.text_config, - load_state_as_np=True, - mp_degree=config.mp_degree) + config.text_config, load_state_as_np=True, mp_degree=config.mp_degree + ) else: raise NotImplementedError else: if "t5" in config.text_config: language_model = T5ForConditionalGeneration( - config.text_config, - 
load_state_as_np=True, - mp_degree=config.mp_degree) + config.text_config, load_state_as_np=True, mp_degree=config.mp_degree + ) else: raise NotImplementedError @@ -441,7 +442,8 @@ def __init__( train_in_satge1=False, text_hidden_size=self.language_model.hidden_size, ignore_mismatched_sizes=True, - mp_degree=config.mp_degree) + mp_degree=config.mp_degree, + ) self.Qformer.cls = None self.Qformer.bert.embeddings.word_embeddings = None self.Qformer.bert.embeddings.position_embeddings = None @@ -452,13 +454,15 @@ def __init__( def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding - def forward(self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor=None, - attention_mask: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, - text_input_stage1: Optional[paddle.Tensor]=None, - **kwargs): + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + text_input_stage1: Optional[paddle.Tensor] = None, + **kwargs + ): if self.train_stage1: return self.forward_stage1(pixel_values, text_input_stage1) @@ -467,15 +471,17 @@ def forward(self, pixel_values, input_ids, attention_mask, - return_dict, ) + return_dict, + ) def forward_stage2( - self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, - **kwargs) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + **kwargs + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -516,54 +522,50 @@ def forward_stage2( >>> print(generated_text) two ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - with paddle.amp.auto_cast(level='O2'): - image_embeds = self.Qformer.ln_vision( - self.visual_encoder(pixel_values)) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + with paddle.amp.auto_cast(level="O2"): + image_embeds = self.Qformer.ln_vision(self.visual_encoder(pixel_values)) image_embeds = image_embeds.astype("float32") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - [image_embeds.shape[0], -1, -1]) + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + query_tokens = self.Qformer.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs[0] # step 3: use the language model, conditioned on the query outputs and the prompt language_model_inputs = self.Qformer.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat( - [language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat([language_model_inputs, 
inputs_embeds], axis=1) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) - attention_mask = paddle.concat( - [language_model_attention_mask, attention_mask], axis=1) + attention_mask = paddle.concat([language_model_attention_mask, attention_mask], axis=1) - targets = input_ids * (1 - ( - input_ids == self.pad_token_id).astype(input_ids.dtype)) + ( - input_ids == self.pad_token_id).astype(input_ids.dtype) * (-100) + targets = input_ids * (1 - (input_ids == self.pad_token_id).astype(input_ids.dtype)) + ( + input_ids == self.pad_token_id + ).astype(input_ids.dtype) * (-100) - empty_targets = paddle.ones( - language_model_attention_mask.shape, dtype="int64").fill_(-100) + empty_targets = paddle.ones(language_model_attention_mask.shape, dtype="int64").fill_(-100) labels = paddle.concat([empty_targets, targets], axis=1) labels.stop_gradient = True - with paddle.amp.auto_cast(level='O2'): + with paddle.amp.auto_cast(level="O2"): outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, return_dict=True, - labels=labels, ) + labels=labels, + ) loss = outputs.loss - return Blip2ForConditionalGenerationModelOutput(loss=loss, ) + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + ) def forward_stage1(self, pixel_values, text_input): text = text_input @@ -572,80 +574,67 @@ def forward_stage1(self, pixel_values, text_input): image_embeds = self.Qformer.ln_vision(self.visual_encoder(image)) image_atts = paddle.ones(image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - shape=[image_embeds.shape[0], -1, -1]) + query_tokens = self.Qformer.query_tokens.expand(shape=[image_embeds.shape[0], -1, -1]) query_output = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_atts, use_cache=True, - return_dict=True) + return_dict=True, + ) image_feats = paddle.nn.functional.normalize( - x=self.Qformer.vision_proj(query_output.last_hidden_state), axis=-1) + x=self.Qformer.vision_proj(query_output.last_hidden_state), axis=-1 + ) text_tokens = self.tokenizer( text, - padding='max_length', + padding="max_length", truncation=True, max_length=self.max_txt_len, return_attention_mask=True, - return_tensors="pd") + return_tensors="pd", + ) text_output = self.Qformer.bert( - text_tokens.input_ids, - attention_mask=text_tokens.attention_mask, - return_dict=True) + text_tokens.input_ids, attention_mask=text_tokens.attention_mask, return_dict=True + ) text_feat = paddle.nn.functional.normalize( - self.Qformer.text_proj(text_output.last_hidden_state[:, 0, :]), - axis=-1) + self.Qformer.text_proj(text_output.last_hidden_state[:, 0, :]), axis=-1 + ) - ###============== Image-text Contrastive ===================### + # Image-text Contrastive # image_feats_all = image_feats # text_feat_all = text_feat image_feats_all = concat_all_gather(image_feats) text_feat_all = concat_all_gather(text_feat) - sim_q2t = paddle.matmul( - image_feats.unsqueeze(axis=1), - text_feat_all.unsqueeze(axis=-1)).squeeze() + sim_q2t = paddle.matmul(image_feats.unsqueeze(axis=1), text_feat_all.unsqueeze(axis=-1)).squeeze() sim_i2t = sim_q2t.max(axis=-1) sim_i2t = sim_i2t / self.temp sim_t2q = paddle.matmul( - x=text_feat.unsqueeze(axis=1).unsqueeze(axis=1), - y=image_feats_all.transpose(perm=[0, 2, 1])).squeeze() + x=text_feat.unsqueeze(axis=1).unsqueeze(axis=1), y=image_feats_all.transpose(perm=[0, 2, 1]) + ).squeeze() sim_t2i = sim_t2q.max(axis=-1) sim_t2i = sim_t2i / self.temp rank = 
dist.get_rank() bs = image.shape[0] - targets = paddle.linspace( - start=rank * bs, stop=rank * bs + bs - 1, num=bs).astype(int) - one_hot_label = paddle.nn.functional.one_hot( - targets, num_classes=sim_i2t.shape[1]) - smooth_label = paddle.nn.functional.label_smooth( - label=one_hot_label, epsilon=0.1) - loss_itc = (paddle.nn.functional.cross_entropy( - input=sim_i2t, label=smooth_label, soft_label=True) + - paddle.nn.functional.cross_entropy( - input=sim_t2i, label=smooth_label, soft_label=True)) / 2 + targets = paddle.linspace(start=rank * bs, stop=rank * bs + bs - 1, num=bs).astype(int) + one_hot_label = paddle.nn.functional.one_hot(targets, num_classes=sim_i2t.shape[1]) + smooth_label = paddle.nn.functional.label_smooth(label=one_hot_label, epsilon=0.1) + loss_itc = ( + paddle.nn.functional.cross_entropy(input=sim_i2t, label=smooth_label, soft_label=True) + + paddle.nn.functional.cross_entropy(input=sim_t2i, label=smooth_label, soft_label=True) + ) / 2 text_input_ids_world = concat_all_gather(text_tokens.input_ids) - text_attention_mask_world = concat_all_gather( - text_tokens.attention_mask) + text_attention_mask_world = concat_all_gather(text_tokens.attention_mask) image_embeds_world = all_gather_with_grad(image_embeds) with paddle.no_grad(): - weights_t2i = paddle.nn.functional.softmax( - x=sim_t2i, axis=1) + 0.0001 - weights_t2i_list = paddle.chunk( - weights_t2i, - chunks=paddle.distributed.get_world_size(), - axis=-1) + weights_t2i = paddle.nn.functional.softmax(x=sim_t2i, axis=1) + 0.0001 + weights_t2i_list = paddle.chunk(weights_t2i, chunks=paddle.distributed.get_world_size(), axis=-1) weights_t2i_list[rank].fill_diagonal_(value=0) weights_t2i = paddle.concat(weights_t2i_list, axis=-1) - weights_i2t = paddle.nn.functional.softmax( - x=sim_i2t, axis=1) + 0.0001 - weights_i2t_list = paddle.chunk( - weights_i2t, - chunks=paddle.distributed.get_world_size(), - axis=-1) + weights_i2t = paddle.nn.functional.softmax(x=sim_i2t, axis=1) + 0.0001 + weights_i2t_list = paddle.chunk(weights_i2t, chunks=paddle.distributed.get_world_size(), axis=-1) weights_i2t_list[rank].fill_diagonal_(value=0) weights_i2t = paddle.concat(weights_i2t_list, axis=-1) image_embeds_neg = [] @@ -661,79 +650,59 @@ def forward_stage1(self, pixel_values, text_input): text_atts_neg.append(text_attention_mask_world[neg_idx]) text_ids_neg = paddle.stack(x=text_ids_neg, axis=0) text_atts_neg = paddle.stack(x=text_atts_neg, axis=0) - text_ids_all = paddle.concat( - x=[text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], - axis=0) + text_ids_all = paddle.concat(x=[text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], axis=0) text_atts_all = paddle.concat( - x=[ - text_tokens.attention_mask, text_tokens.attention_mask, - text_atts_neg - ], - axis=0) - query_tokens_itm = self.Qformer.query_tokens.expand( - shape=[text_ids_all.shape[0], -1, -1]) - query_atts_itm = paddle.ones( - shape=query_tokens_itm.shape[:-1], dtype='int64') - attention_mask_all = paddle.concat( - x=[query_atts_itm, text_atts_all], axis=1) - image_embeds_all = paddle.concat( - x=[image_embeds, image_embeds_neg, image_embeds], axis=0) - image_atts_all = paddle.ones( - shape=image_embeds_all.shape[:-1], dtype='int64') + x=[text_tokens.attention_mask, text_tokens.attention_mask, text_atts_neg], axis=0 + ) + query_tokens_itm = self.Qformer.query_tokens.expand(shape=[text_ids_all.shape[0], -1, -1]) + query_atts_itm = paddle.ones(shape=query_tokens_itm.shape[:-1], dtype="int64") + attention_mask_all = paddle.concat(x=[query_atts_itm, 
text_atts_all], axis=1) + image_embeds_all = paddle.concat(x=[image_embeds, image_embeds_neg, image_embeds], axis=0) + image_atts_all = paddle.ones(shape=image_embeds_all.shape[:-1], dtype="int64") output_itm = self.Qformer.bert( text_ids_all, query_embeds=query_tokens_itm, attention_mask=attention_mask_all, encoder_hidden_states=image_embeds_all, encoder_attention_mask=image_atts_all, - return_dict=True) - vl_embeddings = output_itm.last_hidden_state[:, :query_tokens_itm.shape[ - 1], :] + return_dict=True, + ) + vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.shape[1], :] vl_output = self.Qformer.itm_head(vl_embeddings) logits = vl_output.mean(axis=1) - itm_labels = paddle.concat( - [ - paddle.ones( - [bs], dtype='int64'), paddle.zeros( - [2 * bs], dtype='int64') - ], - axis=0) - loss_itm = paddle.nn.functional.cross_entropy( - input=logits, label=itm_labels) - ##================= Image Captioning ========================## + itm_labels = paddle.concat([paddle.ones([bs], dtype="int64"), paddle.zeros([2 * bs], dtype="int64")], axis=0) + loss_itm = paddle.nn.functional.cross_entropy(input=logits, label=itm_labels) + # Image Captioning decoder_input_ids = text_tokens.input_ids.clone() decoder_input_ids[:, (0)] = self.tokenizer.bos_token_id - labels = masked_fill(decoder_input_ids, - decoder_input_ids == self.tokenizer.pad_token_id, - -100) - query_atts = paddle.ones(shape=query_tokens.shape[:-1], dtype='int64') - attention_mask = paddle.concat( - x=[query_atts, text_tokens.attention_mask], axis=1) - #import pdb;pdb.set_trace() + labels = masked_fill(decoder_input_ids, decoder_input_ids == self.tokenizer.pad_token_id, -100) + query_atts = paddle.ones(shape=query_tokens.shape[:-1], dtype="int64") + attention_mask = paddle.concat(x=[query_atts, text_tokens.attention_mask], axis=1) lm_output = self.Qformer( decoder_input_ids, attention_mask=attention_mask, past_key_values=query_output.past_key_values, return_dict=True, - labels=labels) + labels=labels, + ) loss_lm = lm_output.loss return Blip2ForStage1ModelOutput( - loss=loss_itc + loss_itm + loss_lm, - loss_itc=loss_itc, - loss_itm=loss_itm, - loss_lm=loss_lm) + loss=loss_itc + loss_itm + loss_lm, loss_itc=loss_itc, loss_itm=loss_itm, loss_lm=loss_lm + ) @paddle.no_grad() - def generate_stage1(self, - samples, - use_nucleus_sampling=False, - num_beams=3, - max_length=30, - min_length=10, - top_p=0.9, - repetition_penalty=1.0): + def generate_stage1( + self, + samples, + use_nucleus_sampling=False, + num_beams=3, + max_length=30, + min_length=10, + top_p=0.9, + repetition_penalty=1.0, + ): """ Args: samples (dict): A dictionary containing the following keys: @@ -748,22 +717,16 @@ def generate_stage1(self, Returns: captions (list): A list of strings of length batch_size * num_captions. 
""" - image = samples['image'] + image = samples["image"] image_embeds = self.ln_vision(self.visual_encoder(image)) if not use_nucleus_sampling: image_embeds = image_embeds.repeat_interleave(num_beams, axis=0) else: num_beams = 1 - image_atts = paddle.ones(shape=image_embeds.shape[:-1], dtype='int64') - model_kwargs = { - 'encoder_hidden_states': image_embeds, - 'encoder_attention_mask': image_atts - } - input_ids = paddle.empty( - shape=[image.shape[0], 1], - dtype='int64').fill_(value=self.tokenizer.bos_token_id) - query_tokens = self.query_tokens.expand( - shape=[image_embeds.shape[0], -1, -1]) + image_atts = paddle.ones(shape=image_embeds.shape[:-1], dtype="int64") + model_kwargs = {"encoder_hidden_states": image_embeds, "encoder_attention_mask": image_atts} + input_ids = paddle.empty(shape=[image.shape[0], 1], dtype="int64").fill_(value=self.tokenizer.bos_token_id) + query_tokens = self.query_tokens.expand(shape=[image_embeds.shape[0], -1, -1]) outputs = self.Qformer.generate( input_ids=input_ids, query_embeds=query_tokens, @@ -774,18 +737,19 @@ def generate_stage1(self, top_p=top_p, eos_token_id=self.tokenizer.sep_token_id, pad_token_id=self.tokenizer.pad_token_id, - **model_kwargs) - captions = self.tokenizer.batch_decode( - outputs, skip_special_tokens=True) + **model_kwargs, + ) + captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) return captions @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
Args: @@ -800,32 +764,27 @@ def generate( """ batch_size = pixel_values.shape[0] image_embeds = self.Qformer.ln_vision(self.visual_encoder(pixel_values)) - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - [image_embeds.shape[0], -1, -1]) + query_tokens = self.Qformer.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state language_model_inputs = self.Qformer.language_projection(query_output) - language_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") if input_ids is None: - input_ids = paddle.to_tensor( - [[self.config.text_config.bos_token_id]]).tile([batch_size, 1]) + input_ids = paddle.to_tensor([[self.config.text_config.bos_token_id]]).tile([batch_size, 1]) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) - attention_mask = paddle.concat( - [language_attention_mask, attention_mask], axis=1) + attention_mask = paddle.concat([language_attention_mask, attention_mask], axis=1) # concatenate query embeddings with prompt embeddings inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat( - [language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, @@ -840,63 +799,62 @@ def generate( eos_token_id=50118, repetition_penalty=1, length_penalty=1, - num_return_sequences=1, ) + num_return_sequences=1, + ) return outputs @paddle.no_grad() def encode_image( - self, - pixel_values: paddle.Tensor, - **kwargs, ): - image_embeds = self.ln_vision( - self.visual_encoder(pixel_values.astype("float16"))) + self, + pixel_values: paddle.Tensor, + **kwargs, + ): + image_embeds = self.ln_vision(self.visual_encoder(pixel_values.astype("float16"))) image_embeds = image_embeds.astype("float32") - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs[0] return query_output @paddle.no_grad() - def predict_answers(self, - pixel_values: paddle.Tensor, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - max_len=10, - min_len=1, - **kwargs): - batch_size = pixel_values.shape[0] + def predict_answers( + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + max_len=10, + min_len=1, + **kwargs + ): + # batch_size = pixel_values.shape[0] image_embeds = self.Qformer.ln_vision(self.visual_encoder(pixel_values)) - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - 
[image_embeds.shape[0], -1, -1]) + query_tokens = self.Qformer.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state language_model_inputs = self.Qformer.language_projection(query_output) - language_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") - attention_mask = paddle.concat( - [language_attention_mask, attention_mask], axis=1) + attention_mask = paddle.concat([language_attention_mask, attention_mask], axis=1) # concatenate query embeddings with prompt embeddings inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat( - [language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, @@ -909,7 +867,8 @@ def predict_answers(self, min_length=min_len, eos_token_id=50118, repetition_penalty=1, - length_penalty=0, ) + length_penalty=0, + ) return outputs diff --git a/paddlemix/models/blip2/modeling_opt.py b/paddlemix/models/blip2/modeling_opt.py index 3fdec9f248951..ee9e8250159d1 100644 --- a/paddlemix/models/blip2/modeling_opt.py +++ b/paddlemix/models/blip2/modeling_opt.py @@ -25,28 +25,25 @@ import paddle.nn.functional as F import paddle.tensor as tensor from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.fluid import layers from paddle.nn import Layer +from paddle.nn.functional.flash_attention import flash_attention from paddle.nn.layer.transformer import _convert_param_attr_to_list -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.nn.functional.flash_attention import (flash_attention, ) - from paddlenlp.transformers.conversion_utils import StateDictNameMapping -from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model -from paddlenlp.utils.log import logger - from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, ) + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from paddlenlp.transformers.opt.configuration import ( OPT_PRETRAINED_INIT_CONFIGURATION, OPT_PRETRAINED_RESOURCE_FILES_MAP, - OPTConfig, ) + OPTConfig, +) +from paddlenlp.utils.log import logger -__all__ = [ - "OPTModel", "OPTPretrainedModel", "OPTForCausalLM", - "OPTForConditionalGeneration" -] +__all__ = ["OPTModel", "OPTPretrainedModel", "OPTForCausalLM", "OPTForConditionalGeneration"] def finfo(dtype): @@ -64,24 +61,16 @@ def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): """ batch_size, target_length = input_ids_shape - mask = paddle.full((target_length, target_length), - float(finfo(paddle.get_default_dtype()).min)) + mask = paddle.full((target_length, target_length), float(finfo(paddle.get_default_dtype()).min)) mask_cond = paddle.arange(mask.shape[-1]) mask_cond = mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]) mask = paddle.where(mask_cond, paddle.full(mask_cond.shape, 0), mask) if past_key_values_length > 0: - mask = paddle.concat( - [ - paddle.zeros( - [target_length, 
past_key_values_length], dtype=mask.dtype), - mask - ], - axis=-1) - - expanded_mask = mask.unsqueeze(0).expand( - [batch_size, 1, target_length, target_length + past_key_values_length]) + mask = paddle.concat([paddle.zeros([target_length, past_key_values_length], dtype=mask.dtype), mask], axis=-1) + + expanded_mask = mask.unsqueeze(0).expand([batch_size, 1, target_length, target_length + past_key_values_length]) return expanded_mask @@ -95,9 +84,8 @@ def _expand_mask(mask, tgt_length): expanded_mask = ~(paddle.cast(mask[:, None, None, :], "bool")) expanded_mask = paddle.cast(expanded_mask, dtype=paddle.float32) - expanded_mask = expanded_mask.expand( - [batch_size, 1, tgt_length, src_length]) - expanded_mask = expanded_mask * float(finfo('float16').min) + expanded_mask = expanded_mask.expand([batch_size, 1, tgt_length, src_length]) + expanded_mask = expanded_mask * float(finfo("float16").min) return expanded_mask @@ -113,9 +101,10 @@ class MultiHeadAttention(nn.Layer): StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__( - self, - config: OPTConfig, - need_weights=False, ): + self, + config: OPTConfig, + need_weights=False, + ): super(MultiHeadAttention, self).__init__() self.use_flash_attn = config.get("use_flash_attn", False) self.num_heads = config.num_attention_heads @@ -131,8 +120,8 @@ def __init__( self.mp_degree = config.mp_degree assert ( - self.head_dim * self.num_heads * config.mp_degree == - config.hidden_size), "hidden_size must be divisible by num_heads" + self.head_dim * self.num_heads * config.mp_degree == config.hidden_size + ), "hidden_size must be divisible by num_heads" if config.mp_degree > 1: if self.fuse_attention_qkv: @@ -140,33 +129,34 @@ def __init__( config.hidden_size, config.hidden_size * 3, has_bias=True, - input_is_parallel=True, ) + input_is_parallel=True, + ) else: self.q_proj = fleet.meta_parallel.ColumnParallelLinear( config.hidden_size, config.hidden_size, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( config.hidden_size, config.hidden_size, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( config.hidden_size, config.hidden_size, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.out_proj = fleet.meta_parallel.RowParallelLinear( - config.hidden_size, - config.hidden_size, - input_is_parallel=True, - has_bias=True) + config.hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True + ) else: if self.fuse_attention_qkv: - self.qkv_proj = nn.Linear(config.hidden_size, - 3 * config.hidden_size) + self.qkv_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size) else: self.q_proj = nn.Linear(config.hidden_size, config.hidden_size) self.k_proj = nn.Linear(config.hidden_size, config.hidden_size) @@ -176,15 +166,13 @@ def __init__( def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) - mix_layer = paddle.reshape_(mix_layer, - [0, 0, self.num_heads, 3 * self.head_dim]) + mix_layer = paddle.reshape_(mix_layer, [0, 0, self.num_heads, 3 * self.head_dim]) if not self.use_flash_attn: mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) + q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) - assert not isinstance( - cache, self.StaticCache - ), "cache currently does not support the StaticCache type" + assert not isinstance(cache, self.StaticCache), "cache currently does not support the StaticCache type" if 
isinstance(cache, self.Cache): # for decoder self-attention in inference @@ -262,29 +250,19 @@ def gen_cache(self, key, value=None, type=Cache): return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( - input=key, - shape=[-1, self.num_heads, 0, self.head_dim], - dtype=key.dtype, - value=0) + input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0 + ) v = layers.fill_constant_batch_size_like( - input=key, - shape=[-1, self.num_heads, 0, self.head_dim], - dtype=key.dtype, - value=0) + input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0 + ) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) - def forward(self, - query, - key, - value, - attn_mask=None, - use_cache=False, - cache=None, - output_attention=None, - is_causal=True): + def forward( + self, query, key, value, attn_mask=None, use_cache=False, cache=None, output_attention=None, is_causal=True + ): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. @@ -295,8 +273,7 @@ def forward(self, if self.fuse_attention_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, - cache) + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_flash_attn: bsz, q_len, num_heads, head_dim = q.shape out, weights = flash_attention( @@ -305,12 +282,12 @@ def forward(self, v, causal=is_causal and q.shape[1] != 1, return_softmax=self.need_weights and output_attention, - dropout=self.dropout) + dropout=self.dropout, + ) out = out.reshape([bsz, q_len, head_dim * num_heads]) # scale dot product attention else: - product = paddle.matmul( - x=q * (self.head_dim**-0.5), y=k, transpose_y=True) + product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: product = product + attn_mask @@ -319,24 +296,15 @@ def forward(self, if self.dropout: if self.mp_degree > 1: with get_rng_state_tracker().rng_state("local_seed"): - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") else: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) - out = tensor.reshape( - x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) @@ -364,8 +332,7 @@ def __init__(self, config): act_dropout = config.hidden_dropout_prob normalize_before = getattr(config, "normalize_before", True) - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=config.initializer_range)) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range)) bias_attr = None self._config = locals() @@ -383,18 +350,16 @@ def __init__(self, config): self.self_attn = MultiHeadAttention(config, need_weights=True) if config.mp_degree > 1: self.linear1 = fleet.meta_parallel.ColumnParallelLinear( - d_model, dim_feedforward, has_bias=True, gather_output=True) + d_model, dim_feedforward, 
has_bias=True, gather_output=True + ) else: - self.linear1 = nn.Linear( - d_model, - dim_feedforward, - weight_attrs[2], - bias_attr=bias_attrs[2]) + self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) if config.mp_degree > 1: self.linear2 = fleet.meta_parallel.ColumnParallelLinear( - dim_feedforward, d_model, has_bias=True, gather_output=True) + dim_feedforward, d_model, has_bias=True, gather_output=True + ) """ self.linear2 = fleet.meta_parallel.RowParallelLinear( dim_feedforward, @@ -404,11 +369,7 @@ def __init__(self, config): ) """ else: - self.linear2 = nn.Linear( - dim_feedforward, - d_model, - weight_attrs[2], - bias_attr=bias_attrs[2]) + self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) @@ -421,13 +382,7 @@ def __init__(self, config): self.activation = getattr(F, activation) self.mp_degree = config.mp_degree - def forward(self, - tgt, - memory, - tgt_mask=None, - use_cache=False, - cache=None, - output_attentions=False): + def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None, output_attentions=False): residual = tgt if self.normalize_before: @@ -435,17 +390,9 @@ def forward(self, # self.self_attn(...) --> hidden_states, weights, (cache) if use_cache is False: - tgt, attn_weights = self.self_attn( - tgt, - tgt, - tgt, - tgt_mask, - use_cache, - cache, - output_attention=None) + tgt, attn_weights = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache, output_attention=None) else: - tgt, attn_weights, incremental_cache = self.self_attn( - tgt, tgt, tgt, tgt_mask, use_cache, cache) + tgt, attn_weights, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) if self.mp_degree > 1: with get_rng_state_tracker().rng_state("global_seed"): tgt = residual + self.dropout1(tgt) @@ -459,11 +406,9 @@ def forward(self, tgt = self.norm2(tgt) if self.mp_degree > 1: with get_rng_state_tracker().rng_state("global_seed"): - tgt = self.dropout2( - self.linear2(self.activation(self.linear1(tgt)))) + tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt)))) else: - tgt = self.dropout2( - self.linear2(self.activation(self.linear1(tgt)))) + tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt)))) tgt = residual + tgt if not self.normalize_before: @@ -472,15 +417,12 @@ def forward(self, if not (output_attentions or use_cache): return tgt - temp_list = [ - tgt, attn_weights, incremental_cache if use_cache else None - ] + temp_list = [tgt, attn_weights, incremental_cache if use_cache else None] return tuple(v for v in temp_list if v is not None) def gen_cache(self, memory): - incremental_cache = self.self_attn.gen_cache( - memory, type=self.self_attn.Cache) + incremental_cache = self.self_attn.gen_cache(memory, type=self.self_attn.Cache) return incremental_cache @@ -498,18 +440,15 @@ def __init__(self, config: OPTConfig, decoder_layers: List[Layer]): config.hidden_size, config.word_embed_proj_dim, gather_output=True, - has_bias=False, ) + has_bias=False, + ) else: if config.use_fusedlinear: self.project_out = paddle.incubate.nn.FusedLinear( - config.hidden_size, - config.word_embed_proj_dim, - bias_attr=False) + config.hidden_size, config.word_embed_proj_dim, bias_attr=False + ) else: - self.project_out = nn.Linear( - config.hidden_size, - config.word_embed_proj_dim, - bias_attr=False) + self.project_out = nn.Linear(config.hidden_size, 
config.word_embed_proj_dim, bias_attr=False) else: self.project_out = None @@ -524,16 +463,17 @@ def __init__(self, config: OPTConfig, decoder_layers: List[Layer]): self.checkpoints = [] def forward( - self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - use_cache: bool=False, - cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=False, ): + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + use_cache: bool = False, + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder @@ -552,18 +492,19 @@ def forward( tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i] if cache is not None else cache, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) # outputs = hidden_states if both use_cache and output_attentions are False # Otherwise, outputs = (hidden_states, attention if output_attentions, cache if use_cache) output = outputs[0] if (use_cache or output_attentions) else outputs if output_attentions: - all_self_attentions = all_self_attentions + (outputs[1], ) + all_self_attentions = all_self_attentions + (outputs[1],) if use_cache: new_caches.append(outputs[-1]) if output_hidden_states: - all_hidden_states = all_hidden_states + (output, ) + all_hidden_states = all_hidden_states + (output,) self.checkpoints.append(output.name) if self.final_layer_norm: @@ -573,9 +514,7 @@ def forward( output = self.project_out(output) if not return_dict: - temp_list = [ - output, new_caches, all_hidden_states, all_self_attentions - ] + temp_list = [output, new_caches, all_hidden_states, all_self_attentions] if not (use_cache or output_attentions or output_hidden_states): return output @@ -587,7 +526,8 @@ def forward( past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=None, ) + cross_attentions=None, + ) def gen_cache(self, memory, do_zip=False): r""" @@ -606,10 +546,7 @@ def gen_cache(self, memory, do_zip=False): class OPTLearnedPositionEmbedding(nn.Embedding): """this module learns positional embeddings up to a fixed maximum size""" - def __init__(self, - num_embeddings: int, - embedding_dim: int, - initializer_range: float): + def __init__(self, num_embeddings: int, embedding_dim: int, initializer_range: float): """OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 and adjust num_embeddings appropriately. Other models don't have this hack.
@@ -620,7 +557,7 @@ def __init__(self, self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, attention_mask, past_key_values_length: int=0): + def forward(self, attention_mask, past_key_values_length: int = 0): """get the position embedding with attention mask Args: @@ -634,8 +571,7 @@ def forward(self, attention_mask, past_key_values_length: int=0): if attention_mask.dtype not in [paddle.bool, paddle.int64]: attention_mask = attention_mask == 1.0 - position_ids = paddle.cumsum( - paddle.cast(attention_mask, "int64"), axis=-1) * attention_mask - 1 + position_ids = paddle.cumsum(paddle.cast(attention_mask, "int64"), axis=-1) * attention_mask - 1 # cut positions if `past_key_values_length` is > 0 position_ids = position_ids[:, past_key_values_length:] @@ -653,15 +589,19 @@ def __init__(self, config: OPTConfig): self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( config.vocab_size, config.word_embed_proj_dim, - weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=config.initializer_range)), ) + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) else: self.word_embeddings = nn.Embedding( config.vocab_size, config.word_embed_proj_dim, # padding_idx=config.pad_token_id, - weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=config.initializer_range)), ) + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) if config.word_embed_proj_dim != config.hidden_size: if config.mp_degree > 1: @@ -669,41 +609,34 @@ def __init__(self, config: OPTConfig): config.word_embed_proj_dim, config.hidden_size, gather_output=True, - has_bias=False, ) + has_bias=False, + ) else: if config.use_fusedlinear: self.project_in = paddle.incubate.nn.FusedLinear( - config.word_embed_proj_dim, - config.hidden_size, - bias_attr=False) + config.word_embed_proj_dim, config.hidden_size, bias_attr=False + ) else: - self.project_in = nn.Linear( - config.word_embed_proj_dim, - config.hidden_size, - bias_attr=False) + self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias_attr=False) else: self.project_in = None self.position_embeddings = OPTLearnedPositionEmbedding( num_embeddings=config.max_position_embeddings, embedding_dim=config.hidden_size, - initializer_range=config.initializer_range, ) + initializer_range=config.initializer_range, + ) self.mp_degree = config.mp_degree self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, - input_ids=None, - attention_mask=None, - input_embeddings=None, - past_key_values_length=None): + def forward(self, input_ids=None, attention_mask=None, input_embeddings=None, past_key_values_length=None): if input_ids is not None: input_embeddings = self.word_embeddings(input_ids) if self.project_in: input_embeddings = self.project_in(input_embeddings) - position_embeddings = self.position_embeddings(attention_mask, - past_key_values_length) + position_embeddings = self.position_embeddings(attention_mask, past_key_values_length) embeddings = input_embeddings + position_embeddings if self.mp_degree > 1: @@ -738,37 +671,32 @@ def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): is_split=is_split, tensor_parallel_degree=config.tensor_parallel_degree, tensor_parallel_rank=config.tensor_parallel_rank, - num_attention_heads=config.num_attention_heads, ) - actions = {"word_embeddings.weight": partial(fn, 
is_column=False), } + num_attention_heads=config.num_attention_heads, + ) + actions = { + "word_embeddings.weight": partial(fn, is_column=False), + } for layer_index in range(config.num_hidden_layers): - actions.update({ - # Column Linear - f"decoder.layers.{layer_index}.self_attn.q_proj.weight": - partial( - fn, is_column=True), - f"decoder.layers.{layer_index}.self_attn.k_proj.weight": - partial( - fn, is_column=True), - f"decoder.layers.{layer_index}.self_attn.v_proj.weight": - partial( - fn, is_column=True), - f"decoder.layers.{layer_index}.linear1.weight": partial( - fn, is_column=True), - # Row Linear - f"decoder.layers.{layer_index}.linear2.weight": partial( - fn, is_column=False), - f"decoder.layers.{layer_index}.self_attn.out_proj.weight": - partial( - fn, is_column=False), - }) + actions.update( + { + # Column Linear + f"decoder.layers.{layer_index}.self_attn.q_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.k_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.v_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.linear1.weight": partial(fn, is_column=True), + # Row Linear + f"decoder.layers.{layer_index}.linear2.weight": partial(fn, is_column=False), + f"decoder.layers.{layer_index}.self_attn.out_proj.weight": partial(fn, is_column=False), + } + ) if config.word_embed_proj_dim != config.hidden_size: - actions.update({ - "decoder.project_out.weight": partial( - fn, is_column=True), - "decoder.project_in.weight": partial( - fn, is_column=True), - }) + actions.update( + { + "decoder.project_out.weight": partial(fn, is_column=True), + "decoder.project_in.weight": partial(fn, is_column=True), + } + ) if cls.__name__ != "OPTModel": for key in list(actions.keys()): @@ -777,22 +705,12 @@ def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): return actions @classmethod - def _get_name_mappings(cls, - config: OPTConfig) -> list[StateDictNameMapping]: + def _get_name_mappings(cls, config: OPTConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] model_mappings = [ - [ - "decoder.embed_tokens.weight", - "embeddings.word_embeddings.weight" - ], - [ - "decoder.embed_positions.weight", - "embeddings.position_embeddings.weight" - ], - [ - "decoder.final_layer_norm.weight", - "decoder.final_layer_norm.weight" - ], + ["decoder.embed_tokens.weight", "embeddings.word_embeddings.weight"], + ["decoder.embed_positions.weight", "embeddings.position_embeddings.weight"], + ["decoder.final_layer_norm.weight", "decoder.final_layer_norm.weight"], ["decoder.final_layer_norm.bias", "decoder.final_layer_norm.bias"], ] for layer_index in range(config.num_hidden_layers): @@ -846,27 +764,18 @@ def _get_name_mappings(cls, f"decoder.layers.{layer_index}.linear1.weight", "transpose", ], - [ - f"decoder.layers.{layer_index}.fc1.bias", - f"decoder.layers.{layer_index}.linear1.bias" - ], + [f"decoder.layers.{layer_index}.fc1.bias", f"decoder.layers.{layer_index}.linear1.bias"], [ f"decoder.layers.{layer_index}.fc2.weight", f"decoder.layers.{layer_index}.linear2.weight", "transpose", ], - [ - f"decoder.layers.{layer_index}.fc2.bias", - f"decoder.layers.{layer_index}.linear2.bias" - ], + [f"decoder.layers.{layer_index}.fc2.bias", f"decoder.layers.{layer_index}.linear2.bias"], [ f"decoder.layers.{layer_index}.final_layer_norm.weight", f"decoder.layers.{layer_index}.norm2.weight", ], - [ - f"decoder.layers.{layer_index}.final_layer_norm.bias", - 
f"decoder.layers.{layer_index}.norm2.bias" - ], + [f"decoder.layers.{layer_index}.final_layer_norm.bias", f"decoder.layers.{layer_index}.norm2.bias"], ] model_mappings.extend(layer_mappings) @@ -877,17 +786,12 @@ def _get_name_mappings(cls, mapping[1] = "opt." + mapping[1] # downstream mappings - mappings = [ - StateDictNameMapping( - *mapping, index=index) - for index, mapping in enumerate(model_mappings) - ] + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] return mappings def _init_weights(self, layer): """Initialization hook""" - if isinstance(layer, (paddle.incubate.nn.FusedLinear, nn.Linear, - nn.Embedding)): + if isinstance(layer, (paddle.incubate.nn.FusedLinear, nn.Linear, nn.Embedding)): # In the dygraph mode, use the `set_value` to reset the parameter directly, # and reset the `state_dict` to update parameter in static mode. if isinstance(layer.weight, paddle.Tensor): @@ -895,9 +799,11 @@ def _init_weights(self, layer): paddle.tensor.normal( mean=0.0, std=self.initializer_range - if hasattr(self, "initializer_range") else - self.opt.config["initializer_range"], - shape=layer.weight.shape, )) + if hasattr(self, "initializer_range") + else self.opt.config["initializer_range"], + shape=layer.weight.shape, + ) + ) @register_base_model @@ -932,38 +838,36 @@ def __init__(self, config: OPTConfig): self.decoder = TransformerDecoder(config, decoder_layers) self.checkpoints = [] - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - past_key_values_length): + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = None if input_shape[-1] > 1: combined_attention_mask = _make_causal_mask( - input_shape, - past_key_values_length=past_key_values_length, - dtype=attention_mask.dtype) + input_shape, past_key_values_length=past_key_values_length, dtype=attention_mask.dtype + ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask( - attention_mask, tgt_length=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, tgt_length=input_shape[-1]) combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else - expanded_attn_mask + combined_attention_mask) + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) return combined_attention_mask def forward( - self, - input_ids=None, - position_ids=None, - attention_mask=None, - inputs_embeds=None, - use_cache=False, - cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" The OPTModel forward method, overrides the `__call__()` special method. 
@@ -1031,42 +935,37 @@ def forward( logger.warning("position_ids is not required for OPTModel.") output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = paddle.shape(input_ids) input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: - raise ValueError( - "You have to specify either input_ids or inputs_embeds") + raise ValueError("You have to specify either input_ids or inputs_embeds") self.checkpoints = [] - past_key_values_length = paddle.shape(cache[0].k)[ - 2] if cache is not None else 0 + past_key_values_length = paddle.shape(cache[0].k)[2] if cache is not None else 0 seq_length_with_past = input_shape[-1] + past_key_values_length if attention_mask is None: - attention_mask = paddle.ones( - (input_shape[0], seq_length_with_past), dtype=paddle.bool) + attention_mask = paddle.ones((input_shape[0], seq_length_with_past), dtype=paddle.bool) embedding_output = self.embeddings( input_ids=input_ids, attention_mask=attention_mask, input_embeddings=inputs_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, past_key_values_length) + attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length) attention_mask.stop_gradient = True outputs = self.decoder.forward( @@ -1077,17 +976,17 @@ def forward( cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) if output_hidden_states: if return_dict: - outputs.hidden_states = (embedding_output, - ) + outputs.hidden_states + outputs.hidden_states = (embedding_output,) + outputs.hidden_states else: # [last_hidden_state, caches, all_hidden_states, all_self_attentions] idx = 2 if use_cache else 1 - all_hidden_states = ((embedding_output, ) + outputs[idx], ) - outputs = outputs[:idx] + all_hidden_states + outputs[idx + 1:] + all_hidden_states = ((embedding_output,) + outputs[idx],) + outputs = outputs[:idx] + all_hidden_states + outputs[idx + 1 :] self.checkpoints.extend(self.decoder.checkpoints) return outputs @@ -1108,22 +1007,19 @@ def set_input_embeddings(self, embedding: nn.Embedding): class OPTLMHead(Layer): - def __init__(self, - hidden_size: int, - vocab_size: int, - embedding_weights=None): + def __init__(self, hidden_size: int, vocab_size: int, embedding_weights=None): super(OPTLMHead, self).__init__() - self.decoder_weight = (self.create_parameter( - shape=[vocab_size, hidden_size], - dtype=paddle.get_default_dtype(), - is_bias=True) if embedding_weights is None else embedding_weights) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=paddle.get_default_dtype(), is_bias=True) + if
embedding_weights is None + else embedding_weights + ) def forward(self, hidden_states): if isinstance(hidden_states, BaseModelOutputWithPastAndCrossAttentions): hidden_states = hidden_states["last_hidden_state"] - logits = paddle.tensor.matmul( - hidden_states, self.decoder_weight, transpose_y=True) + logits = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) return logits @@ -1139,9 +1035,10 @@ class OPTForCausalLM(OPTPretrainedModel): def __init__(self, config: OPTConfig, **kwargs): super(OPTForCausalLM, self).__init__(config) - from paddle.distributed import fleet + config.use_fusedlinear = config.get("use_fusedlinear", False) config.mp_degree = config.mp_degree + self.opt = OPTModel(config) self.lm_head = OPTLMHead( hidden_size=self.opt.config.hidden_size, @@ -1150,17 +1047,18 @@ def __init__(self, config: OPTConfig, **kwargs): ) def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - use_cache=False, - cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, ): + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): r""" Args: @@ -1207,9 +1105,10 @@ def forward( print(tokenizer.batch_decode(output_ids[0])) """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.opt( @@ -1220,7 +1119,8 @@ def forward( cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) if use_cache: encoder_outputs, cached_kvs = outputs[:2] @@ -1231,12 +1131,13 @@ def forward( loss = None if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] shift_logits = logits[:, :-1, :] shift_labels = labels[:, 1:] - loss_fct = CrossEntropyLoss(reduction='mean', label_smoothing=None) - labels = shift_labels.reshape((-1, )) + loss_fct = CrossEntropyLoss(reduction="mean", label_smoothing=None) + labels = shift_labels.reshape((-1,)) + valid_index = paddle.where(labels != -100)[0].flatten() logits = shift_logits.reshape((-1, shift_logits.shape[-1])) logits = paddle.gather(logits, valid_index, axis=0) @@ -1249,8 +1150,8 @@ def forward( if not use_cache: return (loss, logits) if loss is not None else logits - outputs = (logits, ) + outputs[1:] - return ((loss, ) + outputs) if loss is not None else outputs + outputs = (logits,) + outputs[1:] + return ((loss,) + outputs) if loss is not None else outputs return CausalLMOutputWithCrossAttentions( loss=loss, @@ -1258,7 +1159,8 @@ def forward( past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, ) + cross_attentions=outputs.cross_attentions, + ) def prepare_fast_entry(self, kwargs: Dict[str, Any]): # import FasterOPT at here to avoid cycling import @@ -1270,37 +1172,27 @@ def prepare_fast_entry(self, kwargs: Dict[str, Any]): decoding_lib = kwargs.get("decoding_lib", None) if 
decode_strategy == "beam_search": - raise AttributeError( - "'beam_search' is not supported yet in the fast version of OPT") + raise AttributeError("'beam_search' is not supported yet in the fast version of OPT") # Currently, FasterTransformer only support restricted size_per_head. - size_per_head = self.opt.config["hidden_size"] // self.opt.config[ - "num_attention_heads"] + + size_per_head = self.opt.config["hidden_size"] // self.opt.config["num_attention_heads"] + if size_per_head not in [32, 64, 80, 96, 128]: raise AttributeError( - "'size_per_head = %d' is not supported yet in the fast version of OPT" - % size_per_head) + "'size_per_head = %d' is not supported yet in the fast version of OPT" % size_per_head + ) if kwargs["forced_bos_token_id"] is not None: # not support for forced_bos_token_id yet in the fast version - raise AttributeError( - "'forced_bos_token_id != None' is not supported yet in the fast version" - ) + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") if kwargs["min_length"] != 0: # not support for min_length yet in the fast version - raise AttributeError( - "'min_length != 0' is not supported yet in the fast version") - self._fast_entry = FasterOPT( - self, - use_fp16_decoding=use_fp16_decoding, - decoding_lib=decoding_lib).forward + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + self._fast_entry = FasterOPT(self, use_fp16_decoding=use_fp16_decoding, decoding_lib=decoding_lib).forward return self._fast_entry - def prepare_inputs_for_generation(self, - input_ids, - use_cache=False, - cache=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, cache=None, attention_mask=None, inputs_embeds=None, **kwargs + ): if cache is not None: input_ids = input_ids[:, -1:] @@ -1310,20 +1202,23 @@ def prepare_inputs_for_generation(self, else: model_inputs = {"input_ids": input_ids} - model_inputs.update({ - "cache": cache, - "use_cache": True, - "attention_mask": attention_mask, - }) + model_inputs.update( + { + "cache": cache, + "use_cache": True, + "attention_mask": attention_mask, + } + ) return model_inputs @staticmethod - def prepare_attention_mask_for_generation(input_ids, pad_token_id, - eos_token_id): + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( - input_ids == pad_token_id).numpy().item() + input_ids == pad_token_id + ).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( - (eos_token_id is not None) and (pad_token_id != eos_token_id)) + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids != pad_token_id).astype("int64") else: @@ -1348,7 +1243,7 @@ class CrossEntropyLoss(nn.Layer): Softmax Cross entropy loss """ - def __init__(self, reduction='mean', label_smoothing=None): + def __init__(self, reduction="mean", label_smoothing=None): super().__init__() if label_smoothing is not None: assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" @@ -1378,12 +1273,12 @@ def forward(self, x, label): loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: - label = paddle.cast(label, 'int64') + label = paddle.cast(label, "int64") loss = F.cross_entropy(x, label=label, soft_label=False) - if 
self.reduction == 'sum': + if self.reduction == "sum": return loss.sum() - elif self.reduction == 'mean': + elif self.reduction == "mean": return loss.mean() else: return loss diff --git a/paddlemix/models/blip2/modeling_utils.py b/paddlemix/models/blip2/modeling_utils.py index 35f9105ae3402..dad18fac98637 100644 --- a/paddlemix/models/blip2/modeling_utils.py +++ b/paddlemix/models/blip2/modeling_utils.py @@ -13,12 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle + import numpy as np -import paddle.nn.functional as F +import paddle import paddle.nn as nn -from paddlemix.utils.log import logger -import time +import paddle.nn.functional as F + + def disabled_train(self, mode=True): """Overwrite model.train with this function to make sure train/eval mode does not change anymore.""" @@ -35,7 +36,7 @@ def concat_all_gather(tensor): return tensor tensors_gather = [] - paddle.distributed.all_gather(tensors_gather, tensor,sync_op=False) + paddle.distributed.all_gather(tensors_gather, tensor, sync_op=False) output = paddle.concat(tensors_gather, axis=0) return output @@ -47,8 +48,7 @@ def tile(x, dim, n_tile): repeat_idx[dim] = n_tile x = x.repeat(*(repeat_idx)) order_index = paddle.to_tensor( - np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]), - dtype='int64' + np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]), dtype="int64" ) return paddle.index_select(x, dim, order_index) @@ -67,12 +67,13 @@ def all_gather_with_grad(tensors): tensor_all = GatherLayer.apply(tensors) return paddle.concat(tensor_all, axis=0) + class CrossEntropyLoss(nn.Layer): """ Softmax Cross entropy loss """ - def __init__(self, reduction='mean', label_smoothing=None): + def __init__(self, reduction="mean", label_smoothing=None): super().__init__() if label_smoothing is not None: assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" @@ -102,16 +103,17 @@ def forward(self, x, label): loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: - label = paddle.cast(label, 'int64') + label = paddle.cast(label, "int64") loss = F.cross_entropy(x, label=label, soft_label=False) - if self.reduction == 'sum': + if self.reduction == "sum": return loss.sum() - elif self.reduction == 'mean': + elif self.reduction == "mean": return loss.mean() else: return loss + class GatherLayer(paddle.autograd.PyLayer): """ Gather tensors from all workers with support for backward propagation: @@ -126,11 +128,11 @@ def forward(ctx, x): @staticmethod def backward(ctx, *grads): - # print(grads) all_gradients = paddle.stack(grads) paddle.distributed.all_reduce(all_gradients) return all_gradients[paddle.distributed.get_rank()] + def masked_fill(x, mask, value): y = paddle.full(x.shape, value, x.dtype) return paddle.where(mask, y, x) diff --git a/paddlemix/models/common/distributed_utils.py b/paddlemix/models/common/distributed_utils.py index d5ab3eab20ca3..2004bd1345360 100644 --- a/paddlemix/models/common/distributed_utils.py +++ b/paddlemix/models/common/distributed_utils.py @@ -30,9 +30,7 @@ def forward(ctx, tensor, group=None): else: rank = dist.get_rank() world_size = dist.get_world_size() - tensors_gather = [ - paddle.empty_like(x=tensor) for _ in range(world_size) - ] + tensors_gather = [paddle.empty_like(x=tensor) for _ in range(world_size)] paddle.distributed.all_gather(tensors_gather, tensor, group=group) ctx.rank = rank ctx.batch_size = 
tensor.shape[0] @@ -40,8 +38,7 @@ def forward(ctx, tensor, group=None): @staticmethod def backward(ctx, grad_output): - return grad_output[ctx.batch_size * ctx.rank:ctx.batch_size * (ctx.rank - + 1)] + return grad_output[ctx.batch_size * ctx.rank : ctx.batch_size * (ctx.rank + 1)] allgather = AllGather.apply diff --git a/paddlemix/models/evaclip/eva_clip_model.py b/paddlemix/models/evaclip/eva_clip_model.py index 798ef23093732..91ce413de1be4 100644 --- a/paddlemix/models/evaclip/eva_clip_model.py +++ b/paddlemix/models/evaclip/eva_clip_model.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle -from paddlenlp.transformers.convbert.modeling import ConvBertClassificationHead + """ CLIP Model Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. @@ -37,10 +37,11 @@ class EVACLIPConfig(PretrainedConfig): model_type = "evaclip" def __init__( - self, - vision_cfg={}, - text_cfg={}, - **kwargs, ): + self, + vision_cfg={}, + text_cfg={}, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -49,48 +50,46 @@ def __init__( @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike]=None, - pretrained_vismodel_name_or_path: Union[str, os.PathLike]=None, - pretrained_textmodel_name_or_path: Union[str, os.PathLike]=None, - **kwargs, ) -> "PretrainedConfig": + cls, + pretrained_model_name_or_path: Union[str, os.PathLike] = None, + pretrained_vismodel_name_or_path: Union[str, os.PathLike] = None, + pretrained_textmodel_name_or_path: Union[str, os.PathLike] = None, + **kwargs, + ) -> "PretrainedConfig": assert pretrained_model_name_or_path is not None or ( - pretrained_vismodel_name_or_path is not None and - pretrained_textmodel_name_or_path is not None - ), (f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" + pretrained_vismodel_name_or_path is not None and pretrained_textmodel_name_or_path is not None + ), ( + f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" f"received `pretrained_model_name_or_path={pretrained_model_name_or_path}` and `pretrained_vismodel_name_or_path={pretrained_vismodel_name_or_path}`, " f"`pretrained_textmodel_name_or_path={pretrained_textmodel_name_or_path}`" - ) + ) config_dict = {} if pretrained_model_name_or_path is not None: - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
) if pretrained_vismodel_name_or_path is not None: - visual_config_dict, kwargs = cls.get_config_dict( - pretrained_vismodel_name_or_path, **kwargs) + visual_config_dict, kwargs = cls.get_config_dict(pretrained_vismodel_name_or_path, **kwargs) - if ("model_type" in visual_config_dict and - visual_config_dict["model_type"] != - "evavision_transformer"): + if "model_type" in visual_config_dict and visual_config_dict["model_type"] != "evavision_transformer": logger.warning( f"You are using a model of type {visual_config_dict['model_type']} to instantiate a model of type " f"evavision_transformer. This is not supported for all configurations of models and can yield errors." ) config_dict["vision_cfg"] = visual_config_dict if pretrained_textmodel_name_or_path is not None: - text_config_dict, kwargs = cls.get_config_dict( - pretrained_textmodel_name_or_path, **kwargs) + text_config_dict, kwargs = cls.get_config_dict(pretrained_textmodel_name_or_path, **kwargs) config_dict["text_cfg"] = text_config_dict - if ("model_type" in text_config_dict and - text_config_dict["model_type"] != "evatext_transformer"): + if "model_type" in text_config_dict and text_config_dict["model_type"] != "evatext_transformer": logger.warning( f"You are using a model of type {text_config_dict['model_type']} to instantiate a model of type " f"evatext_transformer. This is not supported for all configurations of models and can yield errors." @@ -111,21 +110,22 @@ class EVACLIPPretrainedModel(PretrainedModel): @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path=None, - pretrained_vismodel_name_or_path=None, - pretrained_textmodel_name_or_path=None, - from_hf_hub: bool=False, - subfolder: str=None, - *args, - **kwargs, ): + cls, + pretrained_model_name_or_path=None, + pretrained_vismodel_name_or_path=None, + pretrained_textmodel_name_or_path=None, + from_hf_hub: bool = False, + subfolder: str = None, + *args, + **kwargs, + ): assert pretrained_model_name_or_path is not None or ( - pretrained_vismodel_name_or_path is not None and - pretrained_textmodel_name_or_path is not None - ), (f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" + pretrained_vismodel_name_or_path is not None and pretrained_textmodel_name_or_path is not None + ), ( + f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" f"received `pretrained_model_name_or_path={pretrained_model_name_or_path}` and `pretrained_vismodel_name_or_path={pretrained_vismodel_name_or_path}`, " f"`pretrained_textmodel_name_or_path={pretrained_textmodel_name_or_path}`" - ) + ) if pretrained_model_name_or_path is not None: return super().from_pretrained( @@ -133,7 +133,8 @@ def from_pretrained( from_hf_hub=from_hf_hub, subfolder=subfolder, *args, - **kwargs, ) + **kwargs, + ) else: config_dict = { "vision_cfg": pretrained_vismodel_name_or_path, @@ -145,22 +146,21 @@ def from_pretrained( class EVACLIP(EVACLIPPretrainedModel): def __init__( - self, - config, - disable_text=False, - local_loss=False, - gather_with_grad=False, - cache_labels=True, - data_world_rank=0, - data_world_size=1, - enable_recompute=False, ): + self, + config, + disable_text=False, + local_loss=False, + gather_with_grad=False, + cache_labels=True, + data_world_rank=0, + data_world_size=1, + enable_recompute=False, + ): super().__init__(config) if isinstance(config.vision_config, str): - self.visual = 
EVAVisionTransformer.from_pretrained( - config.vision_config) + self.visual = EVAVisionTransformer.from_pretrained(config.vision_config) if not disable_text: - self.text = EVATextTransformer.from_pretrained( - config.text_config) + self.text = EVATextTransformer.from_pretrained(config.text_config) else: vision_config = EVAVisionTransformerConfig(**config.vision_config) text_config = EVATextTransformerConfig(**config.text_config) @@ -169,15 +169,16 @@ def __init__( self.text = EVATextTransformer(text_config) init_data = paddle.ones(shape=[1]) * np.log(1 / 0.07) self.logit_scale = self.create_parameter( - shape=[1], - default_initializer=paddle.nn.initializer.Assign(init_data)) + shape=[1], default_initializer=paddle.nn.initializer.Assign(init_data) + ) self.loss = ClipLoss( local_loss=local_loss, gather_with_grad=gather_with_grad, cache_labels=cache_labels, rank=data_world_rank, - world_size=data_world_size, ) + world_size=data_world_size, + ) if enable_recompute: self.visual.set_grad_checkpointing(True) @@ -185,12 +186,9 @@ def __init__( self.text.set_grad_checkpointing(True) def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False): - self.visual.lock( - unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats) + self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats) - def lock_text_tower(self, - unlocked_layers: int=0, - freeze_layer_norm: bool=True): + def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True): self.text.lock(unlocked_layers, freeze_layer_norm) def set_grad_checkpointing(self, enable=True): @@ -205,31 +203,28 @@ def clip_scale(self): share_buffer = self.logit_scale.clip(0, math.log(100)) self.logit_scale.copy_(share_buffer, True) - def encode_image(self, image, normalize: bool=False): + def encode_image(self, image, normalize: bool = False): features = self.visual(image) - out = (paddle.nn.functional.normalize( - x=features, axis=-1) if normalize else features) + out = paddle.nn.functional.normalize(x=features, axis=-1) if normalize else features return out - def encode_text(self, text, text_features=None, normalize: bool=False): + def encode_text(self, text, text_features=None, normalize: bool = False): if text_features is not None: # directly use text_features if given - return (paddle.nn.functional.normalize( - x=text_features, axis=-1) if normalize else text_features) + return paddle.nn.functional.normalize(x=text_features, axis=-1) if normalize else text_features features = self.text(text) - return (paddle.nn.functional.normalize( - x=features, axis=-1) if normalize else features) + return paddle.nn.functional.normalize(x=features, axis=-1) if normalize else features def forward(self, image, input_ids, text_emb=None, skiploss=False): self.clip_scale() text = input_ids text_features = text_emb image_features = self.encode_image(image, normalize=True) - text_features = self.encode_text( - text, text_features=text_features, normalize=True) + text_features = self.encode_text(text, text_features=text_features, normalize=True) if skiploss: return image_features, text_features, self.logit_scale.exp() loss_itc, logits_per_image, logits_per_text, labels = self.loss( - (image_features, text_features, self.logit_scale.exp())) + (image_features, text_features, self.logit_scale.exp()) + ) return loss_itc, image_features, text_features, self.logit_scale.exp() diff --git a/paddlemix/models/evaclip/eva_text_model.py b/paddlemix/models/evaclip/eva_text_model.py index 7b314fd424cc8..77b8bc2207293 100644 --- 
a/paddlemix/models/evaclip/eva_text_model.py +++ b/paddlemix/models/evaclip/eva_text_model.py @@ -16,7 +16,7 @@ import logging import math import os -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Optional, Union import paddle import paddle.distributed as dist @@ -27,7 +27,7 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from .utils import params_normal_, to_2tuple +from .utils import params_normal_ try: from .modules.fusedln import FusedLayerNorm @@ -37,7 +37,9 @@ print("Warning, FusedLn module is not available, use LayerNorm instead.") try: from paddle.incubate.nn.memory_efficient_attention import ( - LowerTriangularMask, memory_efficient_attention) + LowerTriangularMask, + memory_efficient_attention, + ) except: print("Warning: import memory_efficient_attention error") @@ -87,17 +89,18 @@ class MultiHeadAttention(paddle.nn.Layer): StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, - weight_attr=None, - bias_attr=None, - fuse_attention_qkv=False, - num_partitions=1, ): + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None, + fuse_attention_qkv=False, + num_partitions=1, + ): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -108,8 +111,7 @@ def __init__( self.fuse_attention_qkv = fuse_attention_qkv self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.num_heads % num_partitions == 0 self.num_heads = self.num_heads // num_partitions @@ -124,12 +126,14 @@ def __init__( 3 * embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) else: self.qkv_proj = paddle.nn.Linear( embed_dim, 3 * embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) else: if dist.get_world_size() > 1: self.q_proj = fleet.meta_parallel.ColumnParallelLinear( @@ -137,34 +141,40 @@ def __init__( embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( self.kdim, embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( self.vdim, embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) else: self.q_proj = paddle.nn.Linear( embed_dim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) self.k_proj = paddle.nn.Linear( self.kdim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) self.v_proj = paddle.nn.Linear( self.vdim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) if dist.get_world_size() > 1: self.out_proj = fleet.meta_parallel.RowParallelLinear( @@ -172,12 +182,14 @@ def __init__( embed_dim, weight_attr=weight_attr, has_bias=True, - input_is_parallel=True, ) + input_is_parallel=True, + ) else: self.out_proj = paddle.nn.Linear( embed_dim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) def _fuse_prepare_qkv(self, query): mix_layer = self.qkv_proj(query) @@ 
-250,13 +262,7 @@ def gen_cache(self, key, value=None, type=Cache): # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) - def forward(self, - query, - key, - value, - attn_mask=None, - use_cache=False, - cache=None): + def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. @@ -270,11 +276,9 @@ def forward(self, else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, - cache) + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) # scale dot product attention - product = paddle.matmul( - x=q * (self.head_dim**-0.5), y=k, transpose_y=True) + product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask @@ -288,7 +292,8 @@ def forward(self, weights, self.dropout, training=self.training, - mode="upscale_in_train", ) + mode="upscale_in_train", + ) out = tensor.matmul(weights, v) @@ -317,11 +322,10 @@ def forward(self, x: paddle.Tensor): output = paddle.nn.functional.layer_norm( x=x.astype(dtype="float32"), normalized_shape=self._normalized_shape, - weight=self.weight.astype(dtype="float32") - if self.weight is not None else None, - bias=self.bias.astype(dtype="float32") - if self.bias is not None else None, - epsilon=self._epsilon, ) + weight=self.weight.astype(dtype="float32") if self.weight is not None else None, + bias=self.bias.astype(dtype="float32") if self.bias is not None else None, + epsilon=self._epsilon, + ) return output.astype(dtype=x.dtype) @@ -335,14 +339,15 @@ def forward(self, x: paddle.Tensor): normalized_shape=self._normalized_shape, weight=self.weight, bias=self.bias, - epsilon=self._epsilon, ) + epsilon=self._epsilon, + ) if isinstance(orig_type, paddle.dtype): dtype = orig_type elif isinstance(orig_type, str) and orig_type not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = orig_type elif isinstance(orig_type, paddle.Tensor): @@ -361,9 +366,7 @@ class LayerScale(paddle.nn.Layer): def __init__(self, dim, init_values=1e-05): super().__init__() init_data = init_values * paddle.ones(shape=[dim]) - self.gamma = self.create_parameter( - shape=[dim], - default_initializer=paddle.nn.initializer.Assign(init_data)) + self.gamma = self.create_parameter(shape=[dim], default_initializer=paddle.nn.initializer.Assign(init_data)) def forward(self, x): return x * self.gamma @@ -409,30 +412,28 @@ def forward(self, x): def _in_projection_packed( - q: paddle.Tensor, - k: paddle.Tensor, - v: paddle.Tensor, - w: paddle.Tensor, - b: Optional[paddle.Tensor]=None, ): + q: paddle.Tensor, + k: paddle.Tensor, + v: paddle.Tensor, + w: paddle.Tensor, + b: Optional[paddle.Tensor] = None, +): """ https://github.com/pytorch/pytorch/blob/db2a237763eb8693a20788be94f8c192e762baa8/torch/nn/functional.py#L4726 """ E = q.shape[-1] if k is v: if q is k: - return paddle.nn.functional.linear( - x=q, weight=w, bias=b).chunk( - chunks=3, axis=-1) + return paddle.nn.functional.linear(x=q, weight=w, bias=b).chunk(chunks=3, axis=-1) else: w_q, w_kv = w.split([E, E * 2]) if b is None: b_q = b_kv = None else: b_q, b_kv = b.split([E, E * 2]) - return (paddle.nn.functional.linear( - x=q, weight=w_q, bias=b_q), ) + paddle.nn.functional.linear( - x=k, weight=w_kv, bias=b_kv).chunk( - chunks=2, axis=-1) + return (paddle.nn.functional.linear(x=q, 
weight=w_q, bias=b_q),) + paddle.nn.functional.linear( + x=k, weight=w_kv, bias=b_kv + ).chunk(chunks=2, axis=-1) else: w_q, w_k, w_v = w.chunk(chunks=3) if b is None: @@ -440,12 +441,10 @@ def _in_projection_packed( else: b_q, b_k, b_v = b.chunk(chunks=3) return ( - paddle.nn.functional.linear( - x=q, weight=w_q, bias=b_q), - paddle.nn.functional.linear( - x=k, weight=w_k, bias=b_k), - paddle.nn.functional.linear( - x=v, weight=w_v, bias=b_v), ) + paddle.nn.functional.linear(x=q, weight=w_q, bias=b_q), + paddle.nn.functional.linear(x=k, weight=w_k, bias=b_k), + paddle.nn.functional.linear(x=v, weight=w_v, bias=b_v), + ) def masked_fill(x, mask, value): @@ -455,17 +454,18 @@ def masked_fill(x, mask, value): class Attention(paddle.nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=True, - scaled_cosine=False, - scale_heads=False, - logit_scale_max=math.log(1.0 / 0.01), - attn_drop=0.0, - proj_drop=0.0, - xattn=False, - rope=False, ): + self, + dim, + num_heads=8, + qkv_bias=True, + scaled_cosine=False, + scale_heads=False, + logit_scale_max=math.log(1.0 / 0.01), + attn_drop=0.0, + proj_drop=0.0, + xattn=False, + rope=False, + ): super().__init__() self.scaled_cosine = scaled_cosine self.scale_heads = scale_heads @@ -482,19 +482,22 @@ def __init__( paddle.set_default_dtype(origin_dtype) self.in_proj_weight = self.create_parameter( shape=[dim, dim * 3], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if qkv_bias: init_data = paddle.zeros(shape=[dim * 3]) self.in_proj_bias = self.create_parameter( shape=[dim * 3], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.in_proj_bias = None if self.scaled_cosine: init_data = paddle.log(x=10 * paddle.ones(shape=[num_heads, 1, 1])) self.logit_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.logit_scale = None self.attn_drop = paddle.nn.Dropout(p=attn_drop) @@ -502,12 +505,14 @@ def __init__( init_data = paddle.ones(shape=[num_heads, 1, 1]) self.head_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.head_scale = None if dist.get_world_size() > 1: self.out_proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) + dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: self.out_proj = paddle.nn.Linear(dim, dim) self.out_drop = paddle.nn.Dropout(p=proj_drop) @@ -515,11 +520,11 @@ def __init__( self.xattn_drop = attn_drop self.rope = rope - def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): + def forward(self, x, attn_mask: Optional[paddle.Tensor] = None): L, N, C = x.shape - q, k, v = paddle.nn.functional.linear( - x=x, weight=self.in_proj_weight, bias=self.in_proj_bias).chunk( - chunks=3, axis=-1) + q, k, v = paddle.nn.functional.linear(x=x, weight=self.in_proj_weight, bias=self.in_proj_bias).chunk( + chunks=3, axis=-1 + ) if self.xattn: x = q.reshape((L, N, self.num_heads, -1)) perm_3 = list(range(x.ndim)) @@ -542,8 +547,8 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): v, p=self.xattn_drop, scale=self.scale if self.logit_scale is None else None, - 
attn_bias=LowerTriangularMask() - if attn_mask is not None else None, ) + attn_bias=LowerTriangularMask() if attn_mask is not None else None, + ) else: x = q.reshape((L, N * self.num_heads, -1)) perm_6 = list(range(x.ndim)) @@ -566,11 +571,10 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): perm_9[-1] = -2 perm_9[-2] = -1 attn = paddle.bmm( - x=paddle.nn.functional.normalize( - x=q, axis=-1), - y=x.transpose(perm=perm_9), ) - logit_scale = paddle.clip( - x=self.logit_scale, max=self.logit_scale_max).exp() + x=paddle.nn.functional.normalize(x=q, axis=-1), + y=x.transpose(perm=perm_9), + ) + logit_scale = paddle.clip(x=self.logit_scale, max=self.logit_scale_max).exp() attn = attn.reshape((N, self.num_heads, L, L)) * logit_scale attn = attn.reshape((-1, L, L)) else: @@ -582,11 +586,9 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): attn = paddle.bmm(x=q, y=x.transpose(perm=perm_10)) if attn_mask is not None: if attn_mask.dtype == "bool": - new_attn_mask = paddle.zeros_like( - x=attn_mask).astype(q.dtype) + new_attn_mask = paddle.zeros_like(x=attn_mask).astype(q.dtype) # new_attn_mask.masked_fill_(attn_mask, float('-inf')) - new_attn_mask = masked_fill(new_attn_mask, attn_mask, - float("-inf")) + new_attn_mask = masked_fill(new_attn_mask, attn_mask, float("-inf")) attn_mask = new_attn_mask attn += attn_mask attn = paddle.nn.functional.softmax(attn, axis=-1) @@ -609,16 +611,17 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): class CustomAttention(paddle.nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=True, - scaled_cosine=True, - scale_heads=False, - logit_scale_max=math.log(1.0 / 0.01), - attn_drop=0.0, - proj_drop=0.0, - xattn=False, ): + self, + dim, + num_heads=8, + qkv_bias=True, + scaled_cosine=True, + scale_heads=False, + logit_scale_max=math.log(1.0 / 0.01), + attn_drop=0.0, + proj_drop=0.0, + xattn=False, + ): super().__init__() self.scaled_cosine = scaled_cosine self.scale_heads = scale_heads @@ -635,19 +638,21 @@ def __init__( paddle.set_default_dtype(origin_dtype) self.in_proj_weight = self.create_parameter( shape=[dim, dim * 3], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if qkv_bias: self.in_proj_bias = self.create_parameter( shape=[dim * 3], - default_initializer=paddle.nn.initializer.Assign( - paddle.zeros(shape=[dim * 3])), ) + default_initializer=paddle.nn.initializer.Assign(paddle.zeros(shape=[dim * 3])), + ) else: self.in_proj_bias = None if self.scaled_cosine: init_data = paddle.log(x=10 * paddle.ones(shape=[num_heads, 1, 1])) self.logit_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.logit_scale = None self.attn_drop = paddle.nn.Dropout(p=attn_drop) @@ -655,12 +660,14 @@ def __init__( init_data = paddle.ones(shape=[num_heads, 1, 1]) self.head_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.head_scale = None if dist.get_world_size() > 1: self.out_proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) + dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: self.out_proj = paddle.nn.Linear(dim, dim) self.out_drop = paddle.nn.Dropout(p=proj_drop) @@ 
-668,31 +675,28 @@ def __init__( self.xattn_drop = attn_drop def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - attn_mask: Optional[paddle.Tensor]=None, ): - q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, - self.in_proj_bias) + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + attn_mask: Optional[paddle.Tensor] = None, + ): + q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, self.in_proj_bias) N_q, B_q, C_q = q.shape N_k, B_k, C_k = k.shape N_v, B_v, C_v = v.shape if self.xattn: - q = q.transpose(perm=[1, 0, 2]).reshape( - (B_q, N_q, self.num_heads, -1)) - k = k.transpose(perm=[1, 0, 2]).reshape( - (B_k, N_k, self.num_heads, -1)) - v = v.transpose(perm=[1, 0, 2]).reshape( - (B_v, N_v, self.num_heads, -1)) + q = q.transpose(perm=[1, 0, 2]).reshape((B_q, N_q, self.num_heads, -1)) + k = k.transpose(perm=[1, 0, 2]).reshape((B_k, N_k, self.num_heads, -1)) + v = v.transpose(perm=[1, 0, 2]).reshape((B_v, N_v, self.num_heads, -1)) x = memory_efficient_attention( q, k, v, p=self.xattn_drop, scale=self.scale if self.logit_scale is None else None, - attn_bias=LowerTriangularMask() - if attn_mask is not None else None, ) + attn_bias=LowerTriangularMask() if attn_mask is not None else None, + ) else: x = q.reshape((N_q, B_q * self.num_heads, -1)) @@ -716,13 +720,11 @@ def forward( perm_15[-1] = -2 perm_15[-2] = -1 attn = paddle.bmm( - x=paddle.nn.functional.normalize( - x=q, axis=-1), - y=x.transpose(perm=perm_15), ) - logit_scale = paddle.clip( - x=self.logit_scale, max=self.logit_scale_max).exp() - attn = attn.reshape( - (B_q, self.num_heads, N_q, N_k)) * logit_scale + x=paddle.nn.functional.normalize(x=q, axis=-1), + y=x.transpose(perm=perm_15), + ) + logit_scale = paddle.clip(x=self.logit_scale, max=self.logit_scale_max).exp() + attn = attn.reshape((B_q, self.num_heads, N_q, N_k)) * logit_scale attn = attn.reshape((-1, N_q, N_k)) else: q = q * self.scale @@ -733,10 +735,8 @@ def forward( attn = paddle.bmm(x=q, y=x.transpose(perm=perm_16)) if attn_mask is not None: if attn_mask.dtype == "bool": - new_attn_mask = paddle.zeros_like( - x=attn_mask).astype(q.dtype) - new_attn_mask = masked_fill(new_attn_mask, attn_mask, - float("-inf")) + new_attn_mask = paddle.zeros_like(x=attn_mask).astype(q.dtype) + new_attn_mask = masked_fill(new_attn_mask, attn_mask, float("-inf")) attn_mask = new_attn_mask attn += attn_mask attn = paddle.nn.functional.softmax(attn, axis=-1) @@ -759,19 +759,20 @@ def forward( class CustomResidualAttentionBlock(paddle.nn.Layer): def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float=4.0, - ls_init_value: float=None, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=LayerNorm, - scale_cosine_attn: bool=False, - scale_heads: bool=False, - scale_attn: bool=False, - scale_fc: bool=False, - cross_attn: bool=False, - xattn: bool=False, ): + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + ls_init_value: float = None, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = LayerNorm, + scale_cosine_attn: bool = False, + scale_heads: bool = False, + scale_attn: bool = False, + scale_fc: bool = False, + cross_attn: bool = False, + xattn: bool = False, + ): super().__init__() self.ln_1 = norm_layer(d_model) self.ln_1_k = norm_layer(d_model) if cross_attn else self.ln_1 @@ -784,74 +785,74 @@ def __init__( proj_drop=0.0, scaled_cosine=scale_cosine_attn, scale_heads=scale_heads, - xattn=xattn, ) - self.ln_attn = norm_layer( - 
d_model) if scale_attn else paddle.nn.Identity() - self.ls_1 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else paddle.nn.Identity()) + xattn=xattn, + ) + self.ln_attn = norm_layer(d_model) if scale_attn else paddle.nn.Identity() + self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else paddle.nn.Identity() self.ln_2 = norm_layer(d_model) mlp_width = int(d_model * mlp_ratio) if dist.get_world_size() > 1: - self.mlp = paddle.nn.Sequential(* [ - ( - "c_fc", - fleet.meta_parallel.ColumnParallelLinear( - d_model, - mlp_width, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ("ln", norm_layer(mlp_width) - if scale_fc else paddle.nn.Identity()), - ("gelu", act_layer()), - ( - "c_proj", - fleet.meta_parallel.ColumnParallelLinear( - mlp_width, - d_model, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ]) + self.mlp = paddle.nn.Sequential( + *[ + ( + "c_fc", + fleet.meta_parallel.ColumnParallelLinear( + d_model, + mlp_width, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ("ln", norm_layer(mlp_width) if scale_fc else paddle.nn.Identity()), + ("gelu", act_layer()), + ( + "c_proj", + fleet.meta_parallel.ColumnParallelLinear( + mlp_width, + d_model, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ] + ) else: - self.mlp = paddle.nn.Sequential(* [ - ("c_fc", paddle.nn.Linear(d_model, mlp_width)), - ("ln", norm_layer(mlp_width) - if scale_fc else paddle.nn.Identity()), - ("gelu", act_layer()), - ("c_proj", paddle.nn.Linear(mlp_width, d_model)), - ]) - self.ls_2 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else paddle.nn.Identity()) + self.mlp = paddle.nn.Sequential( + *[ + ("c_fc", paddle.nn.Linear(d_model, mlp_width)), + ("ln", norm_layer(mlp_width) if scale_fc else paddle.nn.Identity()), + ("gelu", act_layer()), + ("c_proj", paddle.nn.Linear(mlp_width, d_model)), + ] + ) + self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else paddle.nn.Identity() def forward( - self, - q: paddle.Tensor, - k: paddle.Tensor, - v: paddle.Tensor, - attn_mask: Optional[paddle.Tensor]=None, ): - q = q + self.ls_1( - self.ln_attn( - self.attn( - self.ln_1(q), - self.ln_1_k(k), - self.ln_1_v(v), - attn_mask=attn_mask))) + self, + q: paddle.Tensor, + k: paddle.Tensor, + v: paddle.Tensor, + attn_mask: Optional[paddle.Tensor] = None, + ): + q = q + self.ls_1(self.ln_attn(self.attn(self.ln_1(q), self.ln_1_k(k), self.ln_1_v(v), attn_mask=attn_mask))) q = q + self.ls_2(self.mlp(self.ln_2(q))) return q class ResidualAttentionBlock(paddle.nn.Layer): def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float=4.0, - ls_init_value: float=None, - act_layer: Callable=nn.GELU, - norm_layer: Callable=LayerNorm, - xattn: bool=False, - is_cross_attention: bool=False, ): + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + ls_init_value: float = None, + act_layer: Callable = nn.GELU, + norm_layer: Callable = LayerNorm, + xattn: bool = False, + is_cross_attention: bool = False, + ): super().__init__() self.ln_1 = norm_layer(d_model) @@ -859,57 +860,64 @@ def __init__( self.attn = Attention(d_model, n_head, xattn=True) else: self.attn = MultiHeadAttention(d_model, n_head) - self.ls_1 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else nn.Identity()) + self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() if is_cross_attention: self.ln_1_kv = norm_layer(d_model) self.ln_2 = 
norm_layer(d_model) mlp_width = int(d_model * mlp_ratio) if dist.get_world_size() > 1: - self.mlp = paddle.nn.Sequential(* [ - ( - "c_fc", - fleet.meta_parallel.ColumnParallelLinear( - d_model, - mlp_width, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ("gelu", act_layer()), - ( - "c_proj", - fleet.meta_parallel.ColumnParallelLinear( - mlp_width, - d_model, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ]) + self.mlp = paddle.nn.Sequential( + *[ + ( + "c_fc", + fleet.meta_parallel.ColumnParallelLinear( + d_model, + mlp_width, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ("gelu", act_layer()), + ( + "c_proj", + fleet.meta_parallel.ColumnParallelLinear( + mlp_width, + d_model, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ] + ) else: - self.mlp = paddle.nn.Sequential(* [ - ("c_fc", paddle.nn.Linear(d_model, mlp_width)), - ("gelu", act_layer()), - ("c_proj", paddle.nn.Linear(mlp_width, d_model)), - ]) - self.ls_2 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else nn.Identity()) + self.mlp = paddle.nn.Sequential( + *[ + ("c_fc", paddle.nn.Linear(d_model, mlp_width)), + ("gelu", act_layer()), + ("c_proj", paddle.nn.Linear(mlp_width, d_model)), + ] + ) + self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() self.xattn = xattn def attention( - self, - q_x, - k_x=None, - v_x=None, - attn_mask=None, ): + self, + q_x, + k_x=None, + v_x=None, + attn_mask=None, + ): if isinstance(q_x.dtype, paddle.dtype): dtype = q_x.dtype elif isinstance(q_x.dtype, str) and q_x.dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = q_x.dtype elif isinstance(q_x.dtype, paddle.Tensor): @@ -920,8 +928,7 @@ def attention( if self.xattn: return self.attn(q_x, attn_mask=attn_mask) - attn_mask = (attn_mask.unsqueeze(0).unsqueeze(0) - if attn_mask is not None else None) + attn_mask = attn_mask.unsqueeze(0).unsqueeze(0) if attn_mask is not None else None q_x = q_x.transpose((1, 0, 2)) k_x = k_x if k_x is not None else q_x v_x = v_x if v_x is not None else q_x @@ -929,19 +936,16 @@ def attention( return out.transpose((1, 0, 2)) def forward( - self, - q_x, - k_x=None, - v_x=None, - attn_mask=None, ): - k_x = (self.ln_1_kv(k_x) - if hasattr(self, "ln_1_kv") and k_x is not None else None) - v_x = (self.ln_1_kv(v_x) - if hasattr(self, "ln_1_kv") and v_x is not None else None) - - x = self.ls_1( - self.attention( - q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)) + self, + q_x, + k_x=None, + v_x=None, + attn_mask=None, + ): + k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None + v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None + + x = self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)) x = x + q_x x = x + self.ls_2(self.mlp(self.ln_2(x))) return x @@ -949,41 +953,44 @@ def forward( class Transformer(paddle.nn.Layer): def __init__( - self, - config, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=LayerNorm, ): + self, + config, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = LayerNorm, + ): super().__init__() self.enable_recompute = False self.width = config.width self.layers = config.layers - self.resblocks = paddle.nn.LayerList(sublayers=[ - ResidualAttentionBlock( - config.width, - config.heads, - mlp_ratio=4.0, - ls_init_value=config.ls_init_value, - act_layer=act_layer, - norm_layer=norm_layer, - 
xattn=config.xattn, ) for _ in range(config.layers) - ]) + self.resblocks = paddle.nn.LayerList( + sublayers=[ + ResidualAttentionBlock( + config.width, + config.heads, + mlp_ratio=4.0, + ls_init_value=config.ls_init_value, + act_layer=act_layer, + norm_layer=norm_layer, + xattn=config.xattn, + ) + for _ in range(config.layers) + ] + ) def get_cast_dtype(self) -> paddle.dtype: return self.resblocks[0].mlp.c_fc.weight.dtype - def forward(self, x: paddle.Tensor, - attn_mask: Optional[paddle.Tensor]=None): + def forward(self, x: paddle.Tensor, attn_mask: Optional[paddle.Tensor] = None): for r in self.resblocks: if self.enable_recompute: - x = paddle.distributed.fleet.utils.recompute( - r, x, attn_mask, use_reentrant=False) + x = paddle.distributed.fleet.utils.recompute(r, x, attn_mask, use_reentrant=False) else: x = r(x, attn_mask=attn_mask) return x class AttentionalPooler(paddle.nn.Layer): - def __init__(self, config, norm_layer: Callable=LayerNorm): + def __init__(self, config, norm_layer: Callable = LayerNorm): super().__init__() d_model = config.num_classes context_dim = config.embed_dim @@ -995,12 +1002,9 @@ def __init__(self, config, norm_layer: Callable=LayerNorm): paddle.set_default_dtype(origin_dtype) self.query = self.create_parameter( shape=[config.n_queries, d_model], - default_initializer=paddle.nn.initializer.Assign(init_data), ) - self.attn = MultiHeadAttention( - d_model, - config.attn_pooler_heads, - kdim=context_dim, - vdim=context_dim) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) + self.attn = MultiHeadAttention(d_model, config.attn_pooler_heads, kdim=context_dim, vdim=context_dim) self.ln_q = norm_layer(d_model) self.ln_k = norm_layer(context_dim) @@ -1020,21 +1024,22 @@ class EVATextTransformerConfig(PretrainedConfig): model_type = "evatext_transformer" def __init__( - self, - context_length: int=77, - vocab_size: int=49408, - width: int=512, - heads: int=8, - layers: int=12, - ls_init_value: float=None, - output_dim: int=512, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=LayerNorm, - xattn: bool=False, - attn_mask: bool=True, - pad_id: int=0, - quick_gelu: bool=False, - **kwargs, ): + self, + context_length: int = 77, + vocab_size: int = 49408, + width: int = 512, + heads: int = 8, + layers: int = 12, + ls_init_value: float = None, + output_dim: int = 512, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = LayerNorm, + xattn: bool = False, + attn_mask: bool = True, + pad_id: int = 0, + quick_gelu: bool = False, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -1053,14 +1058,10 @@ def __init__( self.quick_gelu = quick_gelu @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
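The attention forward passes reformatted in the hunks above all share one pattern when a boolean attn_mask is supplied: the mask is first expanded into an additive float tensor whose True positions hold -inf, and that tensor is added to the raw attention logits before softmax, which drives the masked positions to zero probability. Below is a minimal Paddle sketch of that conversion, for illustration only; it uses paddle.where as a stand-in for the repository's masked_fill helper, which is referenced in the hunks but not defined in them, and the function name is hypothetical.

import paddle


def bool_mask_to_additive(attn_mask: paddle.Tensor, dtype: str = "float32") -> paddle.Tensor:
    # Mirror of the pattern in the hunks above: True positions become -inf,
    # False positions 0.0, so that `attn += additive_mask` suppresses the
    # masked logits before softmax.
    zeros = paddle.zeros_like(attn_mask).astype(dtype)
    return paddle.where(attn_mask, paddle.full_like(zeros, float("-inf")), zeros)


mask = paddle.to_tensor([[False, True], [False, False]])    # True = position to hide
attn = paddle.zeros([2, 2]) + bool_mask_to_additive(mask)   # raw logits + additive mask
probs = paddle.nn.functional.softmax(attn, axis=-1)         # masked column gets ~0 weight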
@@ -1096,54 +1097,43 @@ def __init__(self, config: EVATextTransformerConfig): self.num_pos = config.context_length self.heads = config.heads if dist.get_world_size() > 1: - self.token_embedding = fleet.meta_parallel.VocabParallelEmbedding( - config.vocab_size, width) + self.token_embedding = fleet.meta_parallel.VocabParallelEmbedding(config.vocab_size, width) else: self.token_embedding = paddle.nn.Embedding(config.vocab_size, width) - self.transformer = Transformer( - config, act_layer=act_layer, norm_layer=norm_layer) + self.transformer = Transformer(config, act_layer=act_layer, norm_layer=norm_layer) self.ln_final = norm_layer(width) init_data = paddle.empty(shape=[width, self.output_dim]) self.text_projection = self.create_parameter( shape=[width, self.output_dim], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) init_data = paddle.empty(shape=[self.num_pos, width]) self.positional_embedding = self.create_parameter( shape=[self.num_pos, width], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if config.attn_mask: - self.register_buffer( - "attn_mask", self.build_attention_mask(), persistable=False) + self.register_buffer("attn_mask", self.build_attention_mask(), persistable=False) else: self.attn_mask = None # self.init_parameters() def init_parameters(self): - self.token_embedding.weight = params_normal_( - self.token_embedding.weight, std=0.02) - self.positional_embedding = params_normal_( - self.positional_embedding, std=0.01) + self.token_embedding.weight = params_normal_(self.token_embedding.weight, std=0.02) + self.positional_embedding = params_normal_(self.positional_embedding, std=0.01) - proj_std = (self.transformer.width**-0.5 * (2 * self.transformer.layers) - **-0.5) + proj_std = self.transformer.width**-0.5 * (2 * self.transformer.layers) ** -0.5 attn_std = self.transformer.width**-0.5 - fc_std = (2 * self.transformer.width)**-0.5 + fc_std = (2 * self.transformer.width) ** -0.5 for block in self.transformer.resblocks: - block.attn.q_proj.weight = params_normal_( - block.attn.q_proj.weight, std=attn_std) - block.attn.k_proj.weight = params_normal_( - block.attn.k_proj.weight, std=attn_std) - block.attn.v_proj.weight = params_normal_( - block.attn.v_proj.weight, std=attn_std) - block.attn.out_proj.weight = params_normal_( - block.attn.out_proj.weight, std=proj_std) - block.mlp.c_fc.weight = params_normal_( - block.mlp.c_fc.weight, std=fc_std) - block.mlp.c_proj.weight = params_normal_( - block.mlp.c_proj.weight, std=proj_std) + block.attn.q_proj.weight = params_normal_(block.attn.q_proj.weight, std=attn_std) + block.attn.k_proj.weight = params_normal_(block.attn.k_proj.weight, std=attn_std) + block.attn.v_proj.weight = params_normal_(block.attn.v_proj.weight, std=attn_std) + block.attn.out_proj.weight = params_normal_(block.attn.out_proj.weight, std=proj_std) + block.mlp.c_fc.weight = params_normal_(block.mlp.c_fc.weight, std=fc_std) + block.mlp.c_proj.weight = params_normal_(block.mlp.c_proj.weight, std=proj_std) if self.text_projection is not None: - self.text_projection = params_normal_( - self.text_projection, std=self.transformer.width**-0.5) + self.text_projection = params_normal_(self.text_projection, std=self.transformer.width**-0.5) def set_grad_checkpointing(self, enable=True): self.transformer.enable_recompute = enable @@ -1168,10 +1158,10 @@ def forward(self, text): if isinstance(cast_dtype, 
paddle.dtype): dtype = cast_dtype elif isinstance(cast_dtype, str) and cast_dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = cast_dtype elif isinstance(cast_dtype, paddle.Tensor): @@ -1184,10 +1174,10 @@ def forward(self, text): if isinstance(cast_dtype, paddle.dtype): dtype = cast_dtype elif isinstance(cast_dtype, str) and cast_dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = cast_dtype elif isinstance(cast_dtype, paddle.Tensor): @@ -1202,6 +1192,6 @@ def forward(self, text): pooled = x[paddle.arange(x.shape[0]), text.argmax(axis=-1)] if self.text_projection is not None: - pooled = pooled @self.text_projection + pooled = pooled @ self.text_projection return pooled diff --git a/paddlemix/models/evaclip/eva_vit_model.py b/paddlemix/models/evaclip/eva_vit_model.py index f829400ad921e..df2195370705e 100644 --- a/paddlemix/models/evaclip/eva_vit_model.py +++ b/paddlemix/models/evaclip/eva_vit_model.py @@ -31,8 +31,7 @@ import paddle.distributed as dist from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention +from paddle.incubate.nn.memory_efficient_attention import memory_efficient_attention from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.transformers.model_utils import PretrainedModel from paddlenlp.utils.log import logger @@ -40,10 +39,7 @@ from .utils import to_2tuple, trunc_normal_ -def drop_path(x, - drop_prob: float=0.0, - training: bool=False, - scale_by_keep: bool=True): +def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, @@ -56,17 +52,14 @@ def drop_path(x, if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob - shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) - bern_0 = (paddle.to_tensor( - [keep_prob], dtype=paddle.float32) - if not isinstance(keep_prob, paddle.Tensor) else keep_prob) + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + bern_0 = ( + paddle.to_tensor([keep_prob], dtype=paddle.float32) if not isinstance(keep_prob, paddle.Tensor) else keep_prob + ) random_tensor = paddle.assign( - paddle.bernoulli( - paddle.broadcast_to( - bern_0, paddle.empty( - shape=shape, dtype=x.dtype).shape)), - paddle.empty( - shape=shape, dtype=x.dtype), ) + paddle.bernoulli(paddle.broadcast_to(bern_0, paddle.empty(shape=shape, dtype=x.dtype).shape)), + paddle.empty(shape=shape, dtype=x.dtype), + ) if keep_prob > 0.0 and scale_by_keep: random_tensor = random_tensor.divide(keep_prob) return x * random_tensor @@ -89,10 +82,7 @@ def extra_repr(self) -> str: class Mlp(paddle.nn.Layer): - def __init__(self, - config, - act_layer=paddle.nn.GELU, - norm_layer=paddle.nn.LayerNorm): + def __init__(self, config, act_layer=paddle.nn.GELU, norm_layer=paddle.nn.LayerNorm): super().__init__() in_features = config.embed_dim hidden_features = int(config.embed_dim * config.mlp_ratio) @@ -104,19 +94,20 @@ def __init__(self, hidden_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) self.fc2 = fleet.meta_parallel.ColumnParallelLinear( hidden_features, out_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) else: self.fc1 = paddle.nn.Linear(in_features, hidden_features) self.fc2 = paddle.nn.Linear(hidden_features, out_features) self.act = act_layer() - self.ffn_ln = (norm_layer(hidden_features) - if config.subln else paddle.nn.Identity()) + self.ffn_ln = norm_layer(hidden_features) if config.subln else paddle.nn.Identity() self.drop = paddle.nn.Dropout(p=config.drop_rate) def forward(self, x): @@ -131,11 +122,12 @@ def forward(self, x): class SwiGLU(paddle.nn.Layer): def __init__( - self, - config, - drop=0.0, - act_layer=paddle.nn.Silu, - norm_layer=paddle.nn.LayerNorm, ): + self, + config, + drop=0.0, + act_layer=paddle.nn.Silu, + norm_layer=paddle.nn.LayerNorm, + ): super().__init__() in_features = config.embed_dim hidden_features = int(config.embed_dim * config.mlp_ratio) @@ -146,26 +138,28 @@ def __init__( hidden_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) self.w2 = fleet.meta_parallel.ColumnParallelLinear( in_features, hidden_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) self.w3 = fleet.meta_parallel.ColumnParallelLinear( hidden_features, out_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) else: self.w1 = paddle.nn.Linear(in_features, hidden_features) self.w2 = paddle.nn.Linear(in_features, hidden_features) self.w3 = paddle.nn.Linear(hidden_features, out_features) self.act = act_layer() - self.ffn_ln = (norm_layer(hidden_features) - if config.subln else paddle.nn.Identity()) + self.ffn_ln = norm_layer(hidden_features) if config.subln else paddle.nn.Identity() self.drop = paddle.nn.Dropout(p=drop) def forward(self, x): @@ -180,11 +174,7 @@ def forward(self, x): class Attention(paddle.nn.Layer): - def __init__(self, - config, - window_size=None, - rope=None, - norm_layer=paddle.nn.LayerNorm): + def __init__(self, 
config, window_size=None, rope=None, norm_layer=paddle.nn.LayerNorm): super().__init__() dim = config.embed_dim self.xattn_drop = config.attn_drop_rate @@ -193,8 +183,7 @@ def __init__(self, self.num_heads = config.embed_dim // config.head_width head_dim = dim // self.num_heads - if hasattr(config, - "attn_head_dim") and config.attn_head_dim is not None: + if hasattr(config, "attn_head_dim") and config.attn_head_dim is not None: head_dim = config.attn_head_dim all_head_dim = head_dim * self.num_heads self.scale = config.qk_scale or head_dim**-0.5 @@ -205,23 +194,25 @@ def __init__(self, all_head_dim, weight_attr=None, has_bias=config.qkv_bias, - gather_output=True, ) + gather_output=True, + ) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( dim, all_head_dim, weight_attr=None, has_bias=False, - gather_output=True, ) + gather_output=True, + ) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( dim, all_head_dim, weight_attr=None, has_bias=config.qkv_bias, - gather_output=True, ) + gather_output=True, + ) else: self.q_proj = paddle.nn.Linear(dim, all_head_dim) - self.k_proj = paddle.nn.Linear( - dim, all_head_dim, bias_attr=False) + self.k_proj = paddle.nn.Linear(dim, all_head_dim, bias_attr=False) self.v_proj = paddle.nn.Linear(dim, all_head_dim) else: if dist.get_world_size() > 1: @@ -230,15 +221,14 @@ def __init__(self, all_head_dim * 3, weight_attr=None, has_bias=False, - gather_output=True, ) + gather_output=True, + ) else: - self.qkv = paddle.nn.Linear( - dim, all_head_dim * 3, bias_attr=False) + self.qkv = paddle.nn.Linear(dim, all_head_dim * 3, bias_attr=False) if config.qkv_bias: mpsize = 1 if dist.get_world_size() > 1: - mpsize = (fleet.get_hybrid_communicate_group() - .get_model_parallel_world_size()) + mpsize = fleet.get_hybrid_communicate_group().get_model_parallel_world_size() init_data = paddle.zeros(shape=[all_head_dim // mpsize]) self.q_bias = self.create_parameter( shape=[all_head_dim // mpsize], @@ -253,47 +243,42 @@ def __init__(self, self.v_bias = None if window_size: self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - init_data = paddle.zeros( - shape=[self.num_relative_distance, self.num_heads]) + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + init_data = paddle.zeros(shape=[self.num_relative_distance, self.num_heads]) self.relative_position_bias_table = self.create_parameter( shape=[self.num_relative_distance, self.num_heads], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) coords_h = paddle.arange(end=window_size[0]) coords_w = paddle.arange(end=window_size[1]) coords = paddle.stack(x=paddle.meshgrid([coords_h, coords_w])) coords_flatten = paddle.flatten(x=coords, start_axis=1) - relative_coords = ( - coords_flatten[:, :, (None)] - coords_flatten[:, (None), :]) + relative_coords = coords_flatten[:, :, (None)] - coords_flatten[:, (None), :] relative_coords = relative_coords.transpose(perm=[1, 2, 0]) relative_coords[:, :, (0)] += window_size[0] - 1 relative_coords[:, :, (1)] += window_size[1] - 1 relative_coords[:, :, (0)] *= 2 * window_size[1] - 1 relative_position_index = paddle.zeros( - shape=(window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype, ) + shape=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) relative_position_index[1:, 1:] = relative_coords.sum(axis=-1) relative_position_index[(0), 0:] = 
self.num_relative_distance - 3 relative_position_index[0:, (0)] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", - relative_position_index) + self.register_buffer("relative_position_index", relative_position_index) else: self.window_size = None self.relative_position_bias_table = None self.relative_position_index = None self.attn_drop = paddle.nn.Dropout(p=self.xattn_drop) - self.inner_attn_ln = (norm_layer(all_head_dim) - if (config.subln and config.inner_attn_ln) else - paddle.nn.Identity()) + self.inner_attn_ln = ( + norm_layer(all_head_dim) if (config.subln and config.inner_attn_ln) else paddle.nn.Identity() + ) if dist.get_world_size() > 1: self.proj = fleet.meta_parallel.ColumnParallelLinear( - all_head_dim, - dim, - weight_attr=None, - has_bias=True, - gather_output=True) + all_head_dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: self.proj = paddle.nn.Linear(all_head_dim, dim) self.proj_drop = paddle.nn.Dropout(p=config.drop_rate) @@ -306,46 +291,37 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): k = self.k_proj(x) v = self.v_proj(x) - q = q.reshape( - (B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) - k = k.reshape( - (B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) - v = v.reshape( - (B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) + q = q.reshape((B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) + k = k.reshape((B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) + v = v.reshape((B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) else: qkv_bias = None if self.q_bias is not None: out_0 = paddle.zeros_like(x=self.v_bias) out_0.stop_gradient = not False qkv_bias = paddle.concat(x=(self.q_bias, out_0, self.v_bias)) - qkv = paddle.nn.functional.linear( - x=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = paddle.nn.functional.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) if dist.get_world_size() > 1: hcg = fleet.get_hybrid_communicate_group() if hcg.get_model_parallel_world_size() > 1: model_parallel_group = hcg.get_model_parallel_group() - qkv = paddle.distributed.collective._c_concat( - qkv, group=model_parallel_group) + qkv = paddle.distributed.collective._c_concat(qkv, group=model_parallel_group) - qkv = qkv.reshape( - (B, N, 3, self.num_heads, -1)).transpose(perm=[2, 0, 3, 1, 4]) + qkv = qkv.reshape((B, N, 3, self.num_heads, -1)).transpose(perm=[2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] if self.rope: q_t = q[:, :, 1:, :] ro_q_t = self.rope(q_t) - q = paddle.concat( - x=(q[:, :, :1, :], ro_q_t), axis=-2).astype(dtype=v.dtype) + q = paddle.concat(x=(q[:, :, :1, :], ro_q_t), axis=-2).astype(dtype=v.dtype) k_t = k[:, :, 1:, :] ro_k_t = self.rope(k_t) - k = paddle.concat( - x=(k[:, :, :1, :], ro_k_t), axis=-2).astype(dtype=v.dtype) + k = paddle.concat(x=(k[:, :, :1, :], ro_k_t), axis=-2).astype(dtype=v.dtype) if self.xattn: q = q.transpose(perm=[0, 2, 1, 3]) k = k.transpose(perm=[0, 2, 1, 3]) v = v.transpose(perm=[0, 2, 1, 3]) - x = memory_efficient_attention( - q, k, v, p=self.xattn_drop, scale=self.scale) + x = memory_efficient_attention(q, k, v, p=self.xattn_drop, scale=self.scale) x = x.reshape((B, N, -1)) x = self.inner_attn_ln(x) x = self.proj(x) @@ -357,27 +333,28 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): perm_0 = list(range(x.ndim)) perm_0[-2] = x.ndim - 1 perm_0[-1] = x.ndim - 2 - attn = q @x.transpose(perm=perm_0) + attn = q @ x.transpose(perm=perm_0) if 
self.relative_position_bias_table is not None: relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape((-1))].reshape(( + self.relative_position_index.reshape((-1)) + ].reshape( + ( self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, - -1, )) - relative_position_bias = relative_position_bias.transpose( - perm=[2, 0, 1]) - attn = attn + relative_position_bias.unsqueeze(axis=0).astype( - dtype=attn.dtype) + -1, + ) + ) + relative_position_bias = relative_position_bias.transpose(perm=[2, 0, 1]) + attn = attn + relative_position_bias.unsqueeze(axis=0).astype(dtype=attn.dtype) if rel_pos_bias is not None: attn = attn + rel_pos_bias.astype(dtype=attn.dtype) if attn_mask is not None: attn_mask = attn_mask.astype(dtype="bool") - attn = paddle.where(~attn_mask[:, (None), (None), :], attn, - float("-inf")) + attn = paddle.where(~attn_mask[:, (None), (None), :], attn, float("-inf")) attn = paddle.nn.functional.softmax(attn, axis=-1) with get_rng_state_tracker().rng_state("global_seed"): attn = self.attn_drop(attn) - x = attn @v + x = attn @ v perm_1 = list(range(x.ndim)) perm_1[1] = 2 perm_1[2] = 1 @@ -391,13 +368,14 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): class Block(paddle.nn.Layer): def __init__( - self, - config, - drop_path=0.0, - window_size=None, - rope=None, - act_layer=paddle.nn.GELU, - norm_layer=paddle.nn.LayerNorm, ): + self, + config, + drop_path=0.0, + window_size=None, + rope=None, + act_layer=paddle.nn.GELU, + norm_layer=paddle.nn.LayerNorm, + ): super().__init__() dim = config.embed_dim init_values = config.init_values @@ -405,8 +383,7 @@ def __init__( self.norm1 = norm_layer(dim) self.attn = Attention(config, window_size=window_size, rope=rope) - self.drop_path = (DropPath(drop_path) - if drop_path > 0.0 else paddle.nn.Identity()) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else paddle.nn.Identity() self.norm2 = norm_layer(dim) if config.naiveswiglu: self.mlp = SwiGLU(config, norm_layer=norm_layer) @@ -415,38 +392,32 @@ def __init__( if init_values is not None and init_values > 0: init_data = init_values * paddle.ones(shape=dim) self.gamma_1 = self.create_parameter( - shape=dim, - default_initializer=paddle.nn.initializer.Assign(init_data)) + shape=dim, default_initializer=paddle.nn.initializer.Assign(init_data) + ) init_data = init_values * paddle.ones(shape=dim) self.gamma_2 = self.create_parameter( - shape=dim, - default_initializer=paddle.nn.initializer.Assign(init_data)) + shape=dim, default_initializer=paddle.nn.initializer.Assign(init_data) + ) else: self.gamma_1, self.gamma_2 = None, None def forward(self, x, rel_pos_bias=None, attn_mask=None): if self.gamma_1 is None: if self.postnorm: - x = x + self.drop_path( - self.norm1( - self.attn( - x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) + x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) x = x + self.drop_path(self.norm2(self.mlp(x))) else: - x = x + self.drop_path( - self.attn( - self.norm1(x), - rel_pos_bias=rel_pos_bias, - attn_mask=attn_mask)) + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) x = x + self.drop_path(self.mlp(self.norm2(x))) elif self.postnorm: - x = x + self.drop_path(self.gamma_1 * self.norm1( - self.attn( - x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) + x = x + self.drop_path( + self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) + ) x = x + 
self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) else: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) + x = x + self.drop_path( + self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask) + ) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x @@ -458,10 +429,8 @@ def __init__(self, config): super().__init__() img_size = to_2tuple(config.img_size) patch_size = to_2tuple(config.patch_size) - num_patches = img_size[1] // patch_size[1] * (img_size[0] // - patch_size[0]) - self.patch_shape = img_size[0] // patch_size[0], img_size[ - 1] // patch_size[1] + num_patches = img_size[1] // patch_size[1] * (img_size[0] // patch_size[0]) + self.patch_shape = img_size[0] // patch_size[0], img_size[1] // patch_size[1] self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches @@ -469,7 +438,8 @@ def __init__(self, config): in_channels=config.in_chans, out_channels=config.embed_dim, kernel_size=patch_size, - stride=patch_size, ) + stride=patch_size, + ) def forward(self, x, **kwargs): B, C, H, W = x.shape @@ -488,25 +458,25 @@ class RelativePositionBias(paddle.nn.Layer): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 init_data = paddle.zeros(shape=[self.num_relative_distance, num_heads]) self.relative_position_bias_table = self.create_parameter( shape=[self.num_relative_distance, num_heads], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) coords_h = paddle.arange(end=window_size[0]) coords_w = paddle.arange(end=window_size[1]) coords = paddle.stack(x=paddle.meshgrid([coords_h, coords_w])) coords_flatten = paddle.flatten(x=coords, start_axis=1) - relative_coords = coords_flatten[:, :, (None)] - coords_flatten[:, ( - None), :] + relative_coords = coords_flatten[:, :, (None)] - coords_flatten[:, (None), :] relative_coords = relative_coords.transpose(perm=[1, 2, 0]) relative_coords[:, :, (0)] += window_size[0] - 1 relative_coords[:, :, (1)] += window_size[1] - 1 relative_coords[:, :, (0)] *= 2 * window_size[1] - 1 relative_position_index = paddle.zeros( - shape=(window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype, ) + shape=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) relative_position_index[1:, 1:] = relative_coords.sum(axis=-1) relative_position_index[(0), 0:] = self.num_relative_distance - 3 relative_position_index[0:, (0)] = self.num_relative_distance - 2 @@ -514,11 +484,13 @@ def __init__(self, window_size, num_heads): self.register_buffer("relative_position_index", relative_position_index) def forward(self): - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape((-1))].reshape(( + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.reshape((-1))].reshape( + ( self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, - -1, )) + -1, + ) + ) return relative_position_bias.transpose(perm=[2, 0, 1]) @@ -528,42 +500,43 @@ class EVAVisionTransformerConfig(PretrainedConfig): attribute_map: Dict[str, str] = {} def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - 
num_classes=1000, - embed_dim=768, - depth=12, - num_heads=8, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - init_values=None, - patch_dropout=0.0, - use_abs_pos_emb=True, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - rope=False, - use_mean_pooling=True, - attentional_pool=False, - n_queries: int=256, - attn_pooler_heads: int=8, - init_scale=0.001, - enable_recompute=False, - xattn=False, - postnorm=False, - pt_hw_seq_len=16, - intp_freq=False, - naiveswiglu=False, - subln=False, - output_tokens=False, - fusedLN=False, - inner_attn_ln=True, - **kwargs, ): + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=8, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + init_values=None, + patch_dropout=0.0, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + rope=False, + use_mean_pooling=True, + attentional_pool=False, + n_queries: int = 256, + attn_pooler_heads: int = 8, + init_scale=0.001, + enable_recompute=False, + xattn=False, + postnorm=False, + pt_hw_seq_len=16, + intp_freq=False, + naiveswiglu=False, + subln=False, + output_tokens=False, + fusedLN=False, + inner_attn_ln=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) self.img_size = img_size @@ -602,14 +575,10 @@ def __init__( self.inner_attn_ln = inner_attn_ln @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
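The EVAVisionTransformerConfig defaults above (img_size=224, patch_size=16, embed_dim=768, ...) feed directly into the PatchEmbed bookkeeping reformatted a few hunks earlier: the image is split into a grid of patches, and the flattened grid length plus one cls token gives the sequence length that the position embedding further down is sized for. A small self-contained sketch of that arithmetic follows, written only for illustration (the helper name is not part of the codebase).

def patch_grid(img_size=(224, 224), patch_size=(16, 16)):
    # Same integer divisions as PatchEmbed.__init__ in the diff above:
    # patch_shape is the (rows, cols) grid of patches, num_patches its product.
    patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
    num_patches = patch_shape[0] * patch_shape[1]
    return patch_shape, num_patches


shape, n = patch_grid()   # ((14, 14), 196) with the 224/16 defaults
seq_len = n + 1           # +1 for the cls token; matches the pos_embed
                          # shape [1, num_patches + 1, embed_dim]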
@@ -644,9 +613,7 @@ def __init__(self, config: EVAVisionTransformerConfig): self.embed_dim = embed_dim = config.embed_dim self.naiveswiglu = config.naiveswiglu use_mean_pooling = config.use_mean_pooling - norm_layer = (partial( - FusedLayerNorm, epsilon=1e-6) if config.fusedLN else partial( - LayerNorm, epsilon=1e-6)) + norm_layer = partial(FusedLayerNorm, epsilon=1e-6) if config.fusedLN else partial(LayerNorm, epsilon=1e-6) num_heads = config.embed_dim // config.head_width self.patch_embed = PatchEmbed(config) @@ -654,18 +621,19 @@ def __init__(self, config: EVAVisionTransformerConfig): init_data = paddle.zeros(shape=[1, 1, embed_dim]) self.cls_token = self.create_parameter( shape=[1, 1, embed_dim], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if config.use_abs_pos_emb: init_data = paddle.zeros(shape=[1, num_patches + 1, embed_dim]) self.pos_embed = self.create_parameter( shape=[1, num_patches + 1, embed_dim], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.pos_embed = None self.pos_drop = paddle.nn.Dropout(p=config.drop_rate) if config.use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias( - window_size=self.patch_embed.patch_shape, num_heads=num_heads) + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) else: self.rel_pos_bias = None if config.rope: @@ -674,55 +642,59 @@ def __init__(self, config: EVAVisionTransformerConfig): self.rope = VisionRotaryEmbeddingFast( dim=half_head_dim, pt_seq_len=config.pt_hw_seq_len, - ft_seq_len=hw_seq_len if config.intp_freq else None, ) + ft_seq_len=hw_seq_len if config.intp_freq else None, + ) else: self.rope = None - dpr = [ - x.item() - for x in paddle.linspace( - start=0, stop=config.drop_path_rate, num=config.depth) - ] - self.blocks = paddle.nn.LayerList(sublayers=[ - Block( - config, - drop_path=dpr[i], - norm_layer=norm_layer, - window_size=self.patch_embed.patch_shape - if config.use_rel_pos_bias else None, - rope=self.rope, ) for i in range(config.depth) - ]) + dpr = [x.item() for x in paddle.linspace(start=0, stop=config.drop_path_rate, num=config.depth)] + self.blocks = paddle.nn.LayerList( + sublayers=[ + Block( + config, + drop_path=dpr[i], + norm_layer=norm_layer, + window_size=self.patch_embed.patch_shape if config.use_rel_pos_bias else None, + rope=self.rope, + ) + for i in range(config.depth) + ] + ) if config.attentional_pool: self.attn_pool = AttentionalPooler(config) - self.norm = (paddle.nn.Identity() - if use_mean_pooling else norm_layer(num_classes)) + self.norm = paddle.nn.Identity() if use_mean_pooling else norm_layer(num_classes) self.fc_norm = norm_layer(num_classes) if use_mean_pooling else None if dist.get_world_size() > 1: - self.head = (fleet.meta_parallel.ColumnParallelLinear( - num_classes, - num_classes, - weight_attr=None, - has_bias=True, - gather_output=True, ) - if num_classes > 0 else paddle.nn.Identity()) + self.head = ( + fleet.meta_parallel.ColumnParallelLinear( + num_classes, + num_classes, + weight_attr=None, + has_bias=True, + gather_output=True, + ) + if num_classes > 0 + else paddle.nn.Identity() + ) else: - self.head = (paddle.nn.Linear(num_classes, num_classes) - if num_classes > 0 else paddle.nn.Identity()) + self.head = paddle.nn.Linear(num_classes, num_classes) if num_classes > 0 else paddle.nn.Identity() else: self.attn_pool = None - self.norm = 
(paddle.nn.Identity() - if use_mean_pooling else norm_layer(embed_dim)) + self.norm = paddle.nn.Identity() if use_mean_pooling else norm_layer(embed_dim) self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None if dist.get_world_size() > 1: - self.head = (fleet.meta_parallel.ColumnParallelLinear( - embed_dim, - num_classes, - weight_attr=None, - has_bias=True, - gather_output=True, ) - if num_classes > 0 else paddle.nn.Identity()) + self.head = ( + fleet.meta_parallel.ColumnParallelLinear( + embed_dim, + num_classes, + weight_attr=None, + has_bias=True, + gather_output=True, + ) + if num_classes > 0 + else paddle.nn.Identity() + ) else: - self.head = (paddle.nn.Linear(embed_dim, num_classes) - if num_classes > 0 else paddle.nn.Identity()) + self.head = paddle.nn.Linear(embed_dim, num_classes) if num_classes > 0 else paddle.nn.Identity() if self.pos_embed is not None: trunc_normal_(self.pos_embed, std=0.02) trunc_normal_(self.cls_token, std=0.02) @@ -731,13 +703,9 @@ def __init__(self, config: EVAVisionTransformerConfig): if isinstance(self.head, fleet.meta_parallel.ColumnParallelLinear): trunc_normal_(self.head.weight, std=0.02) with paddle.no_grad(): - self.head.weight.set_value( - self.head.weight.scale(scale=config.init_scale)) - self.head.bias.set_value( - self.head.bias.scale(scale=config.init_scale)) - self.patch_dropout = (PatchDropout(config.patch_dropout) - if config.patch_dropout > 0.0 else - paddle.nn.Identity()) + self.head.weight.set_value(self.head.weight.scale(scale=config.init_scale)) + self.head.bias.set_value(self.head.bias.scale(scale=config.init_scale)) + self.patch_dropout = PatchDropout(config.patch_dropout) if config.patch_dropout > 0.0 else paddle.nn.Identity() def fix_init_weight(self): def rescale(param, layer_id): @@ -762,8 +730,7 @@ def get_cast_dtype(self) -> paddle.dtype: def _init_weights(self, m): zeros_params = paddle.nn.initializer.Constant(0.0) ones_params = paddle.nn.initializer.Constant(1.0) - if isinstance(m, (paddle.nn.Linear, - fleet.meta_parallel.ColumnParallelLinear)): + if isinstance(m, (paddle.nn.Linear, fleet.meta_parallel.ColumnParallelLinear)): trunc_normal_(m.weight, std=0.02) if m.bias is not None: zeros_params(m.bias) @@ -775,8 +742,7 @@ def get_num_layers(self): return len(self.blocks) def lock(self, unlocked_groups=0, freeze_bn_stats=False): - assert (unlocked_groups == 0 - ), "partial locking not currently supported for this model" + assert unlocked_groups == 0, "partial locking not currently supported for this model" for param in self.parameters(): param.stop_gradient = not False @@ -792,16 +758,19 @@ def get_classifier(self): def reset_classifier(self, num_classes, global_pool=""): self.num_classes = num_classes if dist.get_world_size() > 1: - self.head = (fleet.meta_parallel.ColumnParallelLinear( - self.embed_dim, - num_classes, - weight_attr=None, - has_bias=True, - gather_output=True, ) - if num_classes > 0 else paddle.nn.Identity()) + self.head = ( + fleet.meta_parallel.ColumnParallelLinear( + self.embed_dim, + num_classes, + weight_attr=None, + has_bias=True, + gather_output=True, + ) + if num_classes > 0 + else paddle.nn.Identity() + ) else: - self.head = (paddle.nn.Linear(self.embed_dim, num_classes) - if num_classes > 0 else paddle.nn.Identity()) + self.head = paddle.nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else paddle.nn.Identity() def forward_features(self, x, return_all_features=False): x = self.patch_embed(x) @@ -813,25 +782,20 @@ def forward_features(self, x, return_all_features=False): with 
get_rng_state_tracker().rng_state("global_seed"): x = self.pos_drop(x) if os.getenv("RoPE") == "1": - if self.training and not isinstance(self.patch_dropout, - paddle.nn.Identity): + if self.training and not isinstance(self.patch_dropout, paddle.nn.Identity): x, patch_indices_keep = self.patch_dropout(x) - self.rope.forward = partial( - self.rope.forward, patch_indices_keep=patch_indices_keep) + self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep) else: - self.rope.forward = partial( - self.rope.forward, patch_indices_keep=None) + self.rope.forward = partial(self.rope.forward, patch_indices_keep=None) x = self.patch_dropout(x) else: x = self.patch_dropout(x) - rel_pos_bias = self.rel_pos_bias( - ) if self.rel_pos_bias is not None else None + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None cnt = 0 for blk in self.blocks: cnt += 1 if self.enable_recompute: - x = paddle.distributed.fleet.utils.recompute( - blk, x, rel_pos_bias, use_reentrant=False) + x = paddle.distributed.fleet.utils.recompute(blk, x, rel_pos_bias, use_reentrant=False) else: x = blk(x, rel_pos_bias=rel_pos_bias) diff --git a/paddlemix/models/evaclip/loss.py b/paddlemix/models/evaclip/loss.py index 89901ae072014..0181da0b724cb 100644 --- a/paddlemix/models/evaclip/loss.py +++ b/paddlemix/models/evaclip/loss.py @@ -20,10 +20,7 @@ from paddlemix.models.common.distributed_utils import allgather -def gather_features_cat_group_bk(image_features, - text_features, - group, - gather_with_grad=False): +def gather_features_cat_group_bk(image_features, text_features, group, gather_with_grad=False): if group.world_size <= 1: return image_features, text_features features = paddle.concat([image_features, text_features], axis=-1) @@ -37,10 +34,7 @@ def gather_features_cat_group_bk(image_features, return image_features, text_features -def gather_features_cat_group(image_features, - text_features, - group, - gather_with_grad=False): +def gather_features_cat_group(image_features, text_features, group, gather_with_grad=False): if group.world_size <= 1: return image_features, text_features if gather_with_grad: @@ -57,34 +51,35 @@ def gather_features_cat_group(image_features, def gather_features( - image_features, - text_features, - local_loss=False, - gather_with_grad=False, - rank=0, - world_size=1, - use_horovod=False, ): + image_features, + text_features, + local_loss=False, + gather_with_grad=False, + rank=0, + world_size=1, + use_horovod=False, +): hcg = paddle.distributed.fleet.get_hybrid_communicate_group() shardinggroup = hcg.get_sharding_parallel_group() dpgroup = hcg.get_data_parallel_group() if gather_with_grad: if shardinggroup.nranks > 1: image_features, text_features = gather_features_cat_group( - image_features, text_features, shardinggroup, gather_with_grad) + image_features, text_features, shardinggroup, gather_with_grad + ) if dpgroup.nranks > 1: image_features, text_features = gather_features_cat_group( - image_features, text_features, dpgroup, gather_with_grad) + image_features, text_features, dpgroup, gather_with_grad + ) all_image_features = image_features all_text_features = text_features else: image_features_bk = image_features text_features_bk = text_features if shardinggroup.nranks > 1: - image_features, text_features = gather_features_cat_group( - image_features, text_features, shardinggroup) + image_features, text_features = gather_features_cat_group(image_features, text_features, shardinggroup) if dpgroup.nranks > 1: - image_features, text_features = 
gather_features_cat_group( - image_features, text_features, dpgroup) + image_features, text_features = gather_features_cat_group(image_features, text_features, dpgroup) if not local_loss: dp_rank = hcg.get_data_parallel_rank() sharding_rank = hcg.get_sharding_parallel_rank() @@ -104,13 +99,14 @@ def gather_features( def gather_features_bk( - image_features, - text_features, - local_loss=False, - gather_with_grad=False, - rank=0, - world_size=1, - use_horovod=False, ): + image_features, + text_features, + local_loss=False, + gather_with_grad=False, + rank=0, + world_size=1, + use_horovod=False, +): # We gather tensors from all gpus if gather_with_grad: @@ -137,14 +133,15 @@ def gather_features_bk( class ClipLoss(nn.Layer): def __init__( - self, - local_loss=False, - gather_with_grad=False, - cache_labels=False, - visual_loss=True, - text_loss=False, - rank=0, - world_size=1, ): + self, + local_loss=False, + gather_with_grad=False, + cache_labels=False, + visual_loss=True, + text_loss=False, + rank=0, + world_size=1, + ): super().__init__() self.local_loss = local_loss self.gather_with_grad = gather_with_grad @@ -163,18 +160,18 @@ def forward(self, preds): self.local_loss, self.gather_with_grad, self.rank, - self.world_size, ) + self.world_size, + ) if self.local_loss: - logits_per_image = logit_scale * image_features @all_text_features.T - logits_per_text = logit_scale * text_features @all_image_features.T + logits_per_image = logit_scale * image_features @ all_text_features.T + logits_per_text = logit_scale * text_features @ all_image_features.T else: - logits_per_image = (logit_scale * all_image_features - @all_text_features.T) + logits_per_image = logit_scale * all_image_features @ all_text_features.T logits_per_text = logits_per_image.T else: - logits_per_image = logit_scale * image_features @text_features.T - logits_per_text = logit_scale * text_features @image_features.T + logits_per_image = logit_scale * image_features @ text_features.T + logits_per_text = logit_scale * text_features @ image_features.T # calculated ground-truth and cache if enabled num_logits = logits_per_image.shape[0] diff --git a/paddlemix/models/evaclip/modules/fusedln.py b/paddlemix/models/evaclip/modules/fusedln.py index 3e2df13a6148e..beb68a540da7b 100644 --- a/paddlemix/models/evaclip/modules/fusedln.py +++ b/paddlemix/models/evaclip/modules/fusedln.py @@ -55,17 +55,19 @@ def check_normalized_shape(normalized_shape): class FusedLayerNorm(OriginLayerNorm): def __init__( - self, - normalized_shape, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - name=None, ): + self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None, + ): super().__init__( normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, - bias_attr=bias_attr, ) + bias_attr=bias_attr, + ) check_normalized_shape(self._normalized_shape) def forward(self, input): @@ -74,17 +76,19 @@ def forward(self, input): class FastLayerNorm(OriginLayerNorm): def __init__( - self, - normalized_shape, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - name=None, ): + self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None, + ): super().__init__( normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, - bias_attr=bias_attr, ) + bias_attr=bias_attr, + ) check_normalized_shape(self._normalized_shape) def forward(self, input): @@ -105,21 +109,19 @@ def backward(ctx, y_grad): if bias is None: if hasattr(weight, "main_grad"): - weight.main_grad, _ = 
_C_ops.fused_linear_param_grad_add( - x, y_grad, weight.main_grad, None, True) + weight.main_grad, _ = _C_ops.fused_linear_param_grad_add(x, y_grad, weight.main_grad, None, True) return x_grad, None else: - weight_grad, _ = _C_ops.fused_linear_param_grad_add( - x, y_grad, None, None, False) + weight_grad, _ = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False) return x_grad, weight_grad if hasattr(weight, "main_grad") and hasattr(bias, "main_grad"): weight.main_grad, bias.main_grad = _C_ops.fused_linear_param_grad_add( - x, y_grad, weight.main_grad, bias.main_grad, True) + x, y_grad, weight.main_grad, bias.main_grad, True + ) return x_grad, None, None else: - weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add( - x, y_grad, None, None, False) + weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False) return x_grad, weight_grad, bias_grad diff --git a/paddlemix/models/evaclip/modules/rope.py b/paddlemix/models/evaclip/modules/rope.py index 130f517ad4299..adaaf2a9872f7 100644 --- a/paddlemix/models/evaclip/modules/rope.py +++ b/paddlemix/models/evaclip/modules/rope.py @@ -21,21 +21,19 @@ def broadcat(tensors, dim=-1): num_tensors = len(tensors) shape_lens = set(list(map(lambda t: len(t.shape), tensors))) - assert len( - shape_lens) == 1, "tensors must all have the same number of dimensions" + assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" shape_len = list(shape_lens)[0] dim = dim + shape_len if dim < 0 else dim dims = list(zip(*map(lambda t: list(t.shape), tensors))) expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] - assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims) - ]), "invalid dimensions for broadcastable concatentation" + assert all( + [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)] + ), "invalid dimensions for broadcastable concatentation" max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) - expanded_dims = list( - map(lambda t: (t[0], (t[1], ) * num_tensors), max_dims)) + expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) expanded_dims.insert(dim, (dim, dims[dim])) expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) - tensors = list( - map(lambda t: t[0].expand(shape=t[1]), zip(tensors, expandable_shapes))) + tensors = list(map(lambda t: t[0].expand(shape=t[1]), zip(tensors, expandable_shapes))) return paddle.concat(x=tensors, axis=dim) @@ -48,25 +46,23 @@ def rotate_half(x): class VisionRotaryEmbedding(paddle.nn.Layer): def __init__( - self, - dim, - pt_seq_len, - ft_seq_len=None, - custom_freqs=None, - freqs_for="lang", - theta=10000, - max_freq=10, - num_freqs=1, ): + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for="lang", + theta=10000, + max_freq=10, + num_freqs=1, + ): super().__init__() if custom_freqs: freqs = custom_freqs elif freqs_for == "lang": - freqs = 1.0 / theta**(paddle.arange( - start=0, end=dim, - step=2)[:dim // 2].astype(dtype="float32") / dim) + freqs = 1.0 / theta ** (paddle.arange(start=0, end=dim, step=2)[: dim // 2].astype(dtype="float32") / dim) elif freqs_for == "pixel": - freqs = paddle.linspace( - start=1.0, stop=max_freq / 2, num=dim // 2) * pi + freqs = paddle.linspace(start=1.0, stop=max_freq / 2, num=dim // 2) * pi elif freqs_for == "constant": freqs = paddle.ones(shape=num_freqs).astype(dtype="float32") else: @@ -92,33 +88,32 @@ def forward(self, t, start_index=0): t_left, t, t_right = ( t[(...), :start_index], t[(...), 
start_index:end_index], - t[(...), end_index:], ) + t[(...), end_index:], + ) t = t * self.freqs_cos + rotate_half(t) * self.freqs_sin return paddle.concat(x=(t_left, t, t_right), axis=-1) class VisionRotaryEmbeddingFast(paddle.nn.Layer): def __init__( - self, - dim, - pt_seq_len, - ft_seq_len=None, - custom_freqs=None, - freqs_for="lang", - theta=10000, - max_freq=10, - num_freqs=1, - patch_dropout=0.0, ): + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for="lang", + theta=10000, + max_freq=10, + num_freqs=1, + patch_dropout=0.0, + ): super().__init__() if custom_freqs: freqs = custom_freqs elif freqs_for == "lang": - freqs = 1.0 / theta**(paddle.arange( - start=0, end=dim, - step=2)[:dim // 2].astype(dtype="float32") / dim) + freqs = 1.0 / theta ** (paddle.arange(start=0, end=dim, step=2)[: dim // 2].astype(dtype="float32") / dim) elif freqs_for == "pixel": - freqs = paddle.linspace( - start=1.0, stop=max_freq / 2, num=dim // 2) * pi + freqs = paddle.linspace(start=1.0, stop=max_freq / 2, num=dim // 2) * pi elif freqs_for == "constant": freqs = paddle.ones(shape=num_freqs).astype(dtype="float32") else: diff --git a/paddlemix/models/evaclip/utils.py b/paddlemix/models/evaclip/utils.py index 43da3ab27ee85..afc62e5d79a97 100644 --- a/paddlemix/models/evaclip/utils.py +++ b/paddlemix/models/evaclip/utils.py @@ -59,10 +59,7 @@ def parse(x): to_ntuple = _ntuple -def clip_grad_norm_(parameters, - max_norm, - norm_type, - error_if_nonfinite: bool=False): +def clip_grad_norm_(parameters, max_norm, norm_type, error_if_nonfinite: bool = False): r"""Clips gradient norm of an iterable of parameters. The norm is computed over all gradients together, as if they were @@ -90,19 +87,16 @@ def clip_grad_norm_(parameters, return paddle.to_tensor([0.0]) if norm_type == float("inf"): norms = [g.detach().abs().max() for g in grads] - total_norm = norms[0] if len(norms) == 1 else paddle.max( - paddle.stack(norms)) + total_norm = norms[0] if len(norms) == 1 else paddle.max(paddle.stack(norms)) else: - total_norm = paddle.norm( - paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), - norm_type) - if error_if_nonfinite and paddle.logical_or(total_norm.isnan(), - total_norm.isinf()): + total_norm = paddle.norm(paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), norm_type) + if error_if_nonfinite and paddle.logical_or(total_norm.isnan(), total_norm.isinf()): raise RuntimeError( f"The total norm of order {norm_type} for gradients from " "`parameters` is non-finite, so it cannot be clipped. 
To disable " "this error and scale the gradients by the non-finite norm anyway, " - "set `error_if_nonfinite=False`") + "set `error_if_nonfinite=False`" + ) clip_coef = max_norm / (total_norm + 1e-6) # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization @@ -111,15 +105,10 @@ def clip_grad_norm_(parameters, for g in grads: clipg = paddle.multiply(g, clip_coef_clamped) g.set_value(clipg) - total_norm_clip = paddle.norm( - paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), - norm_type) + total_norm_clip = paddle.norm(paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), norm_type) return total_norm_clip -def clip_grad_norm(model, - max_norm, - norm_type=2.0, - error_if_nonfinite: bool=False): +def clip_grad_norm(model, max_norm, norm_type=2.0, error_if_nonfinite: bool = False): parameters = model.parameters() return clip_grad_norm_(parameters, max_norm, norm_type, error_if_nonfinite) diff --git a/paddlemix/models/groundingdino/backbone/backbone.py b/paddlemix/models/groundingdino/backbone/backbone.py index c2ac019c50caf..a6bc6c4b63e16 100644 --- a/paddlemix/models/groundingdino/backbone/backbone.py +++ b/paddlemix/models/groundingdino/backbone/backbone.py @@ -15,12 +15,10 @@ Backbone modules. """ -from collections import OrderedDict -from typing import Dict, List, Optional +from typing import List import paddle import paddle.nn as nn -import paddle.nn.functional as F from .position_encoding import build_position_encoding from .swin_transformer import SwinTransformerModel @@ -59,11 +57,11 @@ def build_backbone(args): use_checkpoint = getattr(args, "use_checkpoint", False) if args.backbone in [ - "swin_T_224_1k", - "swin_B_224_22k", - "swin_B_384_22k", - "swin_L_224_22k", - "swin_L_384_22k", + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", ]: pretrain_img_size = int(args.backbone.split("_")[-2]) backbone = SwinTransformerModel.from_pretrained( @@ -71,9 +69,10 @@ def build_backbone(args): pretrain_img_size=pretrain_img_size, out_indices=tuple(return_interm_indices), dilation=False, - use_checkpoint=use_checkpoint, ) + use_checkpoint=use_checkpoint, + ) - bb_num_channels = backbone.num_features[4 - len(return_interm_indices):] + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] else: raise NotImplementedError("Unknown backbone {}".format(args.backbone)) @@ -83,9 +82,8 @@ def build_backbone(args): model = Joiner(backbone, position_embedding) model.num_channels = bb_num_channels - assert isinstance( - bb_num_channels, - List), "bb_num_channels is expected to be a List but {}".format( - type(bb_num_channels)) + assert isinstance(bb_num_channels, List), "bb_num_channels is expected to be a List but {}".format( + type(bb_num_channels) + ) return model diff --git a/paddlemix/models/groundingdino/backbone/position_encoding.py b/paddlemix/models/groundingdino/backbone/position_encoding.py index 4c4410e6023db..4f7d1af31f124 100644 --- a/paddlemix/models/groundingdino/backbone/position_encoding.py +++ b/paddlemix/models/groundingdino/backbone/position_encoding.py @@ -18,8 +18,6 @@ import paddle import paddle.nn as nn -from matplotlib.pyplot import axis -from paddlenlp.utils.initializer import uniform_ class PositionEmbeddingSineHW(nn.Layer): @@ -29,12 +27,13 @@ class PositionEmbeddingSineHW(nn.Layer): """ def __init__( - self, - num_pos_feats=64, - temperatureH=10000, - 
temperatureW=10000, - normalize=False, - scale=None, ): + self, + num_pos_feats=64, + temperatureH=10000, + temperatureW=10000, + normalize=False, + scale=None, + ): super().__init__() self.num_pos_feats = num_pos_feats self.temperatureH = temperatureH @@ -61,23 +60,15 @@ def forward(self, mask: paddle.Tensor): x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_tx = paddle.arange(self.num_pos_feats) - dim_tx = self.temperatureW**( - 2 * (paddle.floor_divide(dim_tx, paddle.to_tensor(2))) / - self.num_pos_feats) + dim_tx = self.temperatureW ** (2 * (paddle.floor_divide(dim_tx, paddle.to_tensor(2))) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_tx dim_ty = paddle.arange(self.num_pos_feats) - dim_ty = self.temperatureH**( - 2 * (paddle.floor_divide(dim_ty, paddle.to_tensor(2))) / - self.num_pos_feats) + dim_ty = self.temperatureH ** (2 * (paddle.floor_divide(dim_ty, paddle.to_tensor(2))) / self.num_pos_feats) pos_y = y_embed[:, :, :, None] / dim_ty - pos_x = paddle.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), - axis=4).flatten(3) - pos_y = paddle.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), - axis=4).flatten(3) + pos_x = paddle.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4).flatten(3) + pos_y = paddle.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4).flatten(3) pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) return pos @@ -91,7 +82,8 @@ def build_position_encoding(args): N_steps, temperatureH=args.pe_temperatureH, temperatureW=args.pe_temperatureW, - normalize=True, ) + normalize=True, + ) elif args.position_embedding in ("v3", "learned"): position_embedding = PositionEmbeddingLearned(N_steps) else: diff --git a/paddlemix/models/groundingdino/backbone/swin_transformer.py b/paddlemix/models/groundingdino/backbone/swin_transformer.py index 2102a8bc1fa5f..dda95ec7b9932 100644 --- a/paddlemix/models/groundingdino/backbone/swin_transformer.py +++ b/paddlemix/models/groundingdino/backbone/swin_transformer.py @@ -15,20 +15,21 @@ import os from typing import Union -import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.distributed.fleet.utils import recompute from paddle.nn.initializer import Constant +from paddlemix.utils.log import logger + from ..layers import DropPath, to_2tuple trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) from paddlenlp.transformers.configuration_utils import PretrainedConfig -from paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + """ swin_transformer model configuration""" __all__ = ["SwinTransformerConfig"] @@ -38,28 +39,29 @@ class SwinTransformerConfig(PretrainedConfig): model_type = "swintransformer" def __init__( - self, - in_chans=3, - embed_dim=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - pretrain_img_size=224, - patch_size=4, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.2, - norm_layer=nn.LayerNorm, - ape=False, - patch_norm=True, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - dilation=False, - use_checkpoint=False, - **kwargs, ): + self, + in_chans=3, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + pretrain_img_size=224, + patch_size=4, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, 
+ drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + dilation=False, + use_checkpoint=False, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -85,14 +87,10 @@ def __init__( self.use_checkpoint = use_checkpoint @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -116,12 +114,13 @@ class Mlp(nn.Layer): """Multilayer perceptron.""" def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, ): + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -148,10 +147,8 @@ def window_partition(x, window_size): windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape - x = x.reshape( - [B, H // window_size, window_size, W // window_size, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) + x = x.reshape([B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) return windows @@ -166,8 +163,7 @@ def window_reverse(windows, window_size, H, W): x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.reshape( - [B, H // window_size, W // window_size, window_size, window_size, -1]) + x = windows.reshape([B, H // window_size, W // window_size, window_size, window_size, -1]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) return x @@ -186,14 +182,15 @@ class WindowAttention(nn.Layer): """ def __init__( - self, - dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, ): + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.dim = dim @@ -204,24 +201,19 @@ def __init__( # define a parameter table of relative position bias self.relative_position_bias_table = self.create_parameter( - shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads], + shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads], dtype=paddle.float32, - default_initializer=Constant(0.0), ) # 2*Wh-1 * 2*Ww-1, nH + default_initializer=Constant(0.0), + ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(self.window_size[0]) coords_w = paddle.arange(self.window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww + 
coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[ - 0] - 1 # shift to start from 0 + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww @@ -241,34 +233,32 @@ def forward(self, x, mask=None): mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape - qkv = (self.qkv(x) - .reshape([B_, N, 3, self.num_heads, C // self.num_heads]) - .transpose([2, 0, 3, 1, 4])) + qkv = self.qkv(x).reshape([B_, N, 3, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) q, k, v = ( qkv[0], qkv[1], - qkv[2], ) # make torchscript happy (cannot use tensor as tuple) + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) index = self.relative_position_index.flatten() - relative_position_bias = paddle.index_select( - self.relative_position_bias_table, index) + relative_position_bias = paddle.index_select(self.relative_position_bias_table, index) - relative_position_bias = relative_position_bias.reshape([ - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1, - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + relative_position_bias = relative_position_bias.reshape( + [ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1, + ] + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] - attn = attn.reshape( - [-1, nW, self.num_heads, N, N]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, nW, self.num_heads, N, N]) + mask.unsqueeze(1).unsqueeze(0) attn = attn.reshape([-1, self.num_heads, N, N]) attn = self.softmax(attn) else: @@ -300,27 +290,27 @@ class SwinTransformerBlock(nn.Layer): """ def __init__( - self, - dim, - num_heads, - window_size=7, - shift_size=0, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, ): + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio - assert (0 <= self.shift_size < self.window_size - ), "shift_size must in 0-window_size" + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( @@ -330,17 +320,18 @@ def __init__( qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, - proj_drop=drop, ) + proj_drop=drop, + ) - 
self.drop_path = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, - drop=drop, ) + drop=drop, + ) self.H = None self.W = None @@ -371,36 +362,26 @@ def forward(self, x, mask_matrix): # cyclic shift if self.shift_size > 0: - shifted_x = paddle.roll( - x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + shifted_x = paddle.roll(x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows - x_windows = window_partition( - shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.reshape( - [-1, self.window_size * self.window_size, - C]) # nW*B, window_size*window_size, C + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape([-1, self.window_size * self.window_size, C]) # nW*B, window_size*window_size, C # W-MSA/SW-MSA - attn_windows = self.attn( - x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows - attn_windows = attn_windows.reshape( - [-1, self.window_size, self.window_size, C]) - shifted_x = window_reverse(attn_windows, self.window_size, Hp, - Wp) # B H' W' C + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: - x = paddle.roll( - shifted_x, - shifts=(self.shift_size, self.shift_size), - axis=(1, 2)) + x = paddle.roll(shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) else: x = shifted_x @@ -477,20 +458,21 @@ class BasicLayer(nn.Layer): """ def __init__( - self, - dim, - depth, - num_heads, - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - norm_layer=nn.LayerNorm, - downsample=None, - use_checkpoint=False, ): + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 @@ -498,21 +480,24 @@ def __init__( self.use_checkpoint = use_checkpoint # build blocks - self.blocks = nn.LayerList([ - SwinTransformerBlock( - dim=dim, - num_heads=num_heads, - window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] - if isinstance(drop_path, list) else drop_path, - norm_layer=norm_layer, ) for i in range(depth) - ]) + self.blocks = nn.LayerList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) # patch merging layer if downsample is not None: @@ -530,29 +515,27 @@ def forward(self, x, H, W): # calculate attention 
mask for SW-MSA Hp = (H + self.window_size - 1) // self.window_size * self.window_size Wp = (W + self.window_size - 1) // self.window_size * self.window_size - img_mask = paddle.zeros( - (1, Hp, Wp, 1), dtype=paddle.float32) # 1 Hp Wp 1 + img_mask = paddle.zeros((1, Hp, Wp, 1), dtype=paddle.float32) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), ) + slice(-self.shift_size, None), + ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), ) + slice(-self.shift_size, None), + ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 - mask_windows = window_partition( - img_mask, self.window_size) # nW, window_size, window_size, 1 - mask_windows = mask_windows.reshape( - [-1, self.window_size * self.window_size]) + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size]) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = (-100.0 * paddle.ones_like(attn_mask) * - (attn_mask != 0).astype(paddle.float32)) + attn_mask = -100.0 * paddle.ones_like(attn_mask) * (attn_mask != 0).astype(paddle.float32) for blk in self.blocks: blk.H, blk.W = H, W @@ -585,8 +568,7 @@ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): self.in_chans = in_chans self.embed_dim = embed_dim - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: @@ -640,32 +622,29 @@ def __init__(self, config: SwinTransformerConfig): patch_size=self.patch_size, in_chans=self.in_chans, embed_dim=self.embed_dim, - norm_layer=self.norm_layer if self.patch_norm else None, ) + norm_layer=self.norm_layer if self.patch_norm else None, + ) # absolute position embedding if self.ape: - patch_size = to_2tuple(self.patch_size) + # patch_size = to_2tuple(self.patch_size) patches_resolution = [ self.pretrain_img_size[0] // self.patch_size[0], self.pretrain_img_size[1] // self.patch_size[1], ] self.absolute_pos_embed = self.create_parameter( - shape=[ - 1, self.embed_dim, patches_resolution[0], - patches_resolution[1] - ], + shape=[1, self.embed_dim, patches_resolution[0], patches_resolution[1]], dtype=paddle.float32, - default_initializer=Constant(0.0), ) + default_initializer=Constant(0.0), + ) trunc_normal_(self.absolute_pos_embed) self.pos_drop = nn.Dropout(p=config.drop_rate) # stochastic depth dpr = [ - x.item() - for x in paddle.linspace(0, config.drop_path_rate, - sum(config.depths)) + x.item() for x in paddle.linspace(0, config.drop_path_rate, sum(config.depths)) ] # stochastic depth decay rule # build layers @@ -673,13 +652,10 @@ def __init__(self, config: SwinTransformerConfig): # prepare downsample list downsamplelist = [PatchMerging for i in range(self.num_layers)] downsamplelist[-1] = None - num_features = [ - int(self.embed_dim * 2**i) for i in range(self.num_layers) - ] + num_features = [int(self.embed_dim * 2**i) for i in range(self.num_layers)] if self.dilation: downsamplelist[-2] = None - num_features[-1] = int(self.embed_dim * 2 - **(self.num_layers - 1)) // 2 + num_features[-1] = int(self.embed_dim * 2 ** (self.num_layers - 1)) // 2 for i_layer in range(self.num_layers): layer = BasicLayer( 
dim=num_features[i_layer], @@ -691,11 +667,11 @@ def __init__(self, config: SwinTransformerConfig): qk_scale=config.qk_scale, drop=config.drop_rate, attn_drop=config.attn_drop_rate, - drop_path=dpr[sum(config.depths[:i_layer]):sum( - config.depths[:i_layer + 1])], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], norm_layer=self.norm_layer, downsample=downsamplelist[i_layer], - use_checkpoint=config.use_checkpoint, ) + use_checkpoint=config.use_checkpoint, + ) self.layers.append(layer) self.num_features = num_features @@ -715,7 +691,7 @@ def _freeze_stages(self): param.stop_gradient = True if self.frozen_stages >= 1 and self.ape: - self.absolute_pos_embed.stop_gradient = Trueƒ + self.absolute_pos_embed.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() @@ -732,10 +708,8 @@ def forward_raw(self, x): Wh, Ww = x.shape[2:4] if self.ape: # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate( - self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") - x = (x + absolute_pos_embed).flatten(2).transpose( - [0, 2, 1]) # B Wh*Ww C + absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C else: x = x.flatten(2).transpose([0, 2, 1]) x = self.pos_drop(x) @@ -749,8 +723,7 @@ def forward_raw(self, x): norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) - out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( - (0, 3, 1, 2)) + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) outs.append(out) # in: # torch.Size([2, 3, 1024, 1024]) @@ -766,10 +739,8 @@ def forward_with_mask(self, x: paddle.Tensor, m: paddle.Tensor): Wh, Ww = x.shape[2], x.shape[3] if self.ape: # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate( - self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") - x = (x + absolute_pos_embed).flatten(2).transpose( - [0, 2, 1]) # B Wh*Ww C + absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C else: x = x.flatten(2).transpose([0, 2, 1]) x = self.pos_drop(x) @@ -783,17 +754,14 @@ def forward_with_mask(self, x: paddle.Tensor, m: paddle.Tensor): norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) - out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( - (0, 3, 1, 2)) + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) outs.append(out) feat_dict = [] mask_dict = [] for idx, out_i in enumerate(outs): assert m is not None - mask = F.interpolate( - m[None].cast(paddle.float32), - size=out_i.shape[-2:]).cast(paddle.bool)[0] + mask = F.interpolate(m[None].cast(paddle.float32), size=out_i.shape[-2:]).cast(paddle.bool)[0] feat_dict.append(out_i) mask_dict.append(mask) diff --git a/paddlemix/models/groundingdino/bert_model.py b/paddlemix/models/groundingdino/bert_model.py index e661f4621bc86..ca9c89bb8b5b2 100644 --- a/paddlemix/models/groundingdino/bert_model.py +++ b/paddlemix/models/groundingdino/bert_model.py @@ -14,15 +14,14 @@ import math import warnings -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Tuple import numpy as np import paddle import paddle.nn as nn -import paddle.nn.functional as F -from paddlenlp.taskflow.utils import pad_batch_data -from paddlenlp.transformers.bert.modeling import \ - 
BaseModelOutputWithPoolingAndCrossAttentions +from paddlenlp.transformers.bert.modeling import ( + BaseModelOutputWithPoolingAndCrossAttentions, +) class GELUActivation(nn.Layer): @@ -33,7 +32,7 @@ class GELUActivation(nn.Layer): Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ - def __init__(self, use_gelu_python: bool=False): + def __init__(self, use_gelu_python: bool = False): super().__init__() self.act = nn.functional.gelu @@ -42,20 +41,16 @@ def forward(self, input): class BertSelfAttention(nn.Layer): - def __init__(self, - config, - clamp_min_for_underflow=False, - clamp_max_for_overflow=False): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})") + f"heads ({config.num_attention_heads})" + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) @@ -63,8 +58,7 @@ def __init__(self, self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") # 'absolute' + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # 'absolute' self.clamp_min_for_underflow = clamp_min_for_underflow self.clamp_max_for_overflow = clamp_max_for_overflow @@ -73,19 +67,21 @@ def __init__(self, def transpose_for_scores(self, x): new_x_shape = tuple(x.shape[:-1]) + ( self.num_attention_heads, - self.attention_head_size, ) + self.attention_head_size, + ) x = x.reshape(new_x_shape) return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): mixed_query_layer = self.query(hidden_states) @@ -97,17 +93,14 @@ def forward( value_layer = past_key_value[1] attention_mask = encoder_attention_mask elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: # here key_layer = 
self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -118,11 +111,9 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul(query_layer, - key_layer.transpose([0, 1, 3, 2])) + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) # return attention_scores - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if self.clamp_min_for_underflow: attention_scores = paddle.clip(attention_scores, min=-50000) @@ -143,15 +134,13 @@ def forward( context_layer = paddle.matmul(attention_probs, value_layer) context_layer = context_layer.transpose([0, 2, 1, 3]) - new_context_layer_shape = tuple(context_layer.shape[:-2]) + ( - self.all_head_size, ) + new_context_layer_shape = tuple(context_layer.shape[:-2]) + (self.all_head_size,) context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) if self.is_decoder: - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -159,8 +148,7 @@ class BertSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -171,25 +159,22 @@ def forward(self, hidden_states, input_tensor): class BertAttention(nn.Layer): - def __init__(self, - config, - clamp_min_for_underflow=False, - clamp_max_for_overflow=False): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): super().__init__() - self.self = BertSelfAttention(config, clamp_min_for_underflow, - clamp_max_for_overflow) + self.self = BertSelfAttention(config, clamp_min_for_underflow, clamp_max_for_overflow) self.output = BertSelfOutput(config) self.pruned_heads = set() def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): self_outputs = self.self( hidden_states, attention_mask, @@ -197,11 +182,11 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) # pass + output_attentions, + ) # pass # return self_outputs attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -224,8 +209,7 @@ class BertOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, 
epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -240,35 +224,32 @@ class BertEmbeddings(nn.Layer): def __init__(self, config): super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, - config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", - paddle.arange(config.max_position_embeddings).reshape((1, -1)), ) + paddle.arange(config.max_position_embeddings).reshape((1, -1)), + ) self.register_buffer( "token_type_ids", - paddle.zeros( - self.position_ids.shape, dtype=paddle.int64), - persistable=False, ) + paddle.zeros(self.position_ids.shape, dtype=paddle.int64), + persistable=False, + ) def forward( - self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, ): + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ): if input_ids is not None: input_shape = input_ids.shape else: @@ -277,15 +258,12 @@ def forward( seq_length = input_shape[1] if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length: - seq_length + - past_key_values_length] + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] if token_type_ids is None: if hasattr(self, "token_type_ids"): buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - [input_shape[0], seq_length]) + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([input_shape[0], seq_length]) token_type_ids = buffered_token_type_ids_expanded else: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) @@ -314,32 +292,30 @@ def __init__(self, config): self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: - raise ValueError( - f"{self} should be used as a decoder model if cross attention is added" - ) - self.crossattention = BertAttention( - config, position_embedding_type="absolute") + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): - - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self, + hidden_states, + attention_mask=None, + head_mask=None, 
+ encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) # return self_attention_outputs attention_output = self_attention_outputs[0] # if decoder, the last output is tuple of self-attn cache @@ -347,18 +323,17 @@ def forward( outputs = self_attention_outputs[1:-1] present_key_value = self_attention_outputs[-1] else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states is not None: if not hasattr(self, "crossattention"): raise ValueError( f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" - " by setting `config.add_cross_attention=True`") + " by setting `config.add_cross_attention=True`" + ) - cross_attn_past_key_value = (past_key_value[-2:] if - past_key_value is not None else None) + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None cross_attention_outputs = self.crossattention( attention_output, attention_mask, @@ -366,7 +341,8 @@ def forward( encoder_hidden_states, encoder_attention_mask, cross_attn_past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:-1] @@ -375,10 +351,10 @@ def forward( layer_output = self.feed_forward_chunk(attention_output) - outputs = (layer_output, ) + outputs + outputs = (layer_output,) + outputs if self.is_decoder: - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs @@ -392,35 +368,33 @@ class BertEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.LayerList([BertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - all_cross_attentions = (() if output_attentions and - self.config.add_cross_attention else None) + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None next_decoder_cache = () if use_cache else None for i, layer_module in enumerate(self.layer): if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is 
not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None layer_outputs = layer_module( hidden_states, @@ -429,35 +403,39 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) # return layer_outputs hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=hidden_states, pooler_output=None, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, - attentions=all_self_attentions, ) + attentions=all_self_attentions, + ) class BertPooler(nn.Layer): @@ -510,11 +488,12 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - device: str=None, - dtype: np.float=None, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + device: str = None, + dtype: np.float = None, + ) -> paddle.Tensor: if dtype is None: dtype = np.float32 @@ -523,7 +502,8 @@ def get_extended_attention_mask( if device is not None: warnings.warn( "The `device` argument is deprecated and will be removed in v5 of Transformers.", - FutureWarning, ) + FutureWarning, + ) if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] @@ -535,34 +515,30 @@ def get_extended_attention_mask( f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ) - extended_attention_mask = paddle.cast( - extended_attention_mask, dtype=dtype) # fp16 compatibility - extended_attention_mask = ( - 1.0 - extended_attention_mask) * np.finfo(dtype).min + extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * np.finfo(dtype).min return extended_attention_mask - def get_head_mask(self, - head_mask, - num_hidden_layers, - is_attention_chunked=False): + def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False): head_mask = [None] * num_hidden_layers return head_mask def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + 
encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -583,13 +559,11 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -597,47 +571,38 @@ def forward( use_cache = False if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.shape elif inputs_embeds is not None: input_shape = inputs_embeds.shape[:-1] else: - raise ValueError( - "You have to specify either input_ids or inputs_embeds") + raise ValueError("You have to specify either input_ids or inputs_embeds") batch_size, seq_length = input_shape # past_key_values_length - past_key_values_length = (past_key_values[0][0].shape[2] - if past_key_values is not None else 0) + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) if token_type_ids is None: if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - [batch_size, seq_length]) + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([batch_size, seq_length]) token_type_ids = buffered_token_type_ids_expanded else: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) - extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = 
self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -648,7 +613,8 @@ def forward( position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) # return embedding_output encoder_outputs = self.encoder( embedding_output, @@ -660,11 +626,11 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) # return encoder_outputs sequence_output = encoder_outputs[0] - pooled_output = (self.pooler(sequence_output) - if self.pooler is not None else None) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -675,7 +641,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class language_model(nn.Layer): @@ -685,9 +652,9 @@ def __init__(self, cfg, bert_config): self.bert_name = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE print( "LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", - self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT, ) - bert_config.gradient_checkpointing = ( - self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) + self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT, + ) + bert_config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT self.model = BertModel(bert_config) self.language_dim = 768 @@ -702,20 +669,18 @@ def forward(self, x): outputs = self.model( input_ids=input, attention_mask=mask, - output_hidden_states=True, ) + output_hidden_states=True, + ) # outputs has 13 layers, 1 input layer and 12 hidden layers encoded_layers = outputs.hidden_states[1:] features = None - features = paddle.stack(encoded_layers[-self.num_layers:], - 1).mean(1) + features = paddle.stack(encoded_layers[-self.num_layers :], 1).mean(1) # language embedding has shape [len(phrase), seq_len, language_dim] features = features / self.num_layers - embedded = paddle.cast(features * mask.unsqueeze(-1), - paddle.float32) - aggregate = embedded.sum(1) / ( - paddle.cast(mask.sum(-1).unsqueeze(-1), paddle.float32)) + embedded = paddle.cast(features * mask.unsqueeze(-1), paddle.float32) + aggregate = embedded.sum(1) / (paddle.cast(mask.sum(-1).unsqueeze(-1), paddle.float32)) ret = { "aggregate": aggregate, diff --git a/paddlemix/models/groundingdino/bertwarper.py b/paddlemix/models/groundingdino/bertwarper.py index ad618d8cfd9b0..abc4f4d02a286 100644 --- a/paddlemix/models/groundingdino/bertwarper.py +++ b/paddlemix/models/groundingdino/bertwarper.py @@ -14,9 +14,9 @@ import paddle import paddle.nn as nn -import paddle.nn.functional as F -from paddlenlp.transformers.model_outputs import \ - BaseModelOutputWithPoolingAndCrossAttentions +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, +) from .bert_model import BertModel @@ -37,20 +37,21 @@ def __init__(self, bert_model): self.use_return_dict = True def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - 
output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -71,11 +72,10 @@ def forward( If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.use_return_dict if self.config.is_decoder: @@ -84,9 +84,7 @@ def forward( use_cache = False if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.shape batch_size, seq_length = input_shape @@ -94,23 +92,19 @@ def forward( input_shape = inputs_embeds.shape[:-1] batch_size, seq_length = input_shape else: - raise ValueError( - "You have to specify either input_ids or inputs_embeds") + raise ValueError("You have to specify either input_ids or inputs_embeds") # past_key_values_length - past_key_values_length = (past_key_values[0][0].shape[2] - if past_key_values is not None else 0) + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -138,7 +132,8 @@ def forward( position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) encoder_outputs = self.encoder( embedding_output, @@ -150,10 +145,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) sequence_output = encoder_outputs[0] - pooled_output = (self.pooler(sequence_output) - if self.pooler is not None else None) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -164,7 +159,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class TextEncoderShell(nn.Layer): @@ -178,8 +174,7 @@ def forward(self, **kw): return self.text_encoder(**kw) -def generate_masks_with_special_tokens(tokenized, special_tokens_list, - tokenizer): +def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. Shape: [bs, num_token] @@ -198,8 +193,7 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, idxs = paddle.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = ( - paddle.eye(num_token, dtype=paddle.bool).unsqueeze(0).tile([bs, 1, 1])) + attention_mask = paddle.eye(num_token, dtype=paddle.bool).unsqueeze(0).tile([bs, 1, 1]) position_ids = paddle.zeros((bs, num_token)) previous_col = 0 for i in range(idxs.shape[0]): @@ -208,10 +202,8 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1:col + 1, previous_col + 1:col + - 1] = True - position_ids[row, previous_col + 1:col + 1] = paddle.arange( - 0, col - previous_col) + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) previous_col = col @@ -222,8 +214,7 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, return attention_mask, position_ids.cast(paddle.int64) -def generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens_list, tokenizer): +def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. 
Shape: [bs, num_token] @@ -242,8 +233,7 @@ def generate_masks_with_special_tokens_and_transfer_map( idxs = paddle.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = (paddle.eye(num_token, dtype=paddle.int32) - .cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1])) + attention_mask = paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) position_ids = paddle.zeros((bs, num_token)) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -253,12 +243,14 @@ def generate_masks_with_special_tokens_and_transfer_map( attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1:col + 1, previous_col + 1:col + - 1] = True - position_ids[row, previous_col + 1:col + 1] = paddle.arange( - 0, col - previous_col) - c2t_maski = paddle.zeros([num_token, ]).cast(paddle.bool) - c2t_maski[previous_col + 1:col] = True + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros( + [ + num_token, + ] + ).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True cate_to_token_mask_list[row].append(c2t_maski) previous_col = col @@ -271,5 +263,4 @@ def generate_masks_with_special_tokens_and_transfer_map( # padding_mask = tokenized['attention_mask'] # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() - return attention_mask, position_ids.cast( - paddle.int64), cate_to_token_mask_list + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list diff --git a/paddlemix/models/groundingdino/configuration.py b/paddlemix/models/groundingdino/configuration.py index 853217aa9daa3..da41db3e25490 100644 --- a/paddlemix/models/groundingdino/configuration.py +++ b/paddlemix/models/groundingdino/configuration.py @@ -17,6 +17,8 @@ from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlemix.utils.log import logger + __all__ = ["GroundingDinoConfig"] @@ -25,50 +27,51 @@ class GroundingDinoConfig(PretrainedConfig): model_type = "groundingdino" def __init__( - self, - modelname="groundingdino", - backbone="swin_T_224_1k", - position_embedding="sine", - pe_temperatureH=20, - pe_temperatureW=20, - return_interm_indices=[1, 2, 3], - backbone_freeze_keywords=None, - enc_layers=6, - dec_layers=6, - pre_norm=False, - dim_feedforward=2048, - hidden_dim=256, - dropout=0.0, - nheads=8, - num_queries=900, - query_dim=4, - num_patterns=0, - num_feature_levels=4, - enc_n_points=4, - dec_n_points=4, - two_stage_type="standard", - two_stage_bbox_embed_share=False, - two_stage_class_embed_share=False, - transformer_activation="relu", - dec_pred_bbox_embed_share=True, - dn_box_noise_scale=1.0, - dn_label_noise_ratio=0.5, - dn_label_coef=1.0, - dn_bbox_coef=1.0, - embed_init_tgt=True, - dn_labelbook_size=2000, - max_text_len=256, - text_encoder_type="bert-base-uncased", - use_text_enhancer=True, - use_fusion_layer=True, - use_checkpoint=False, - use_transformer_ckpt=False, - use_text_cross_attention=True, - text_dropout=0.0, - fusion_dropout=0.0, - fusion_droppath=0.1, - sub_sentence_present=True, - **kwargs, ): + self, + modelname="groundingdino", + backbone="swin_T_224_1k", + position_embedding="sine", + pe_temperatureH=20, + pe_temperatureW=20, + return_interm_indices=[1, 2, 3], + backbone_freeze_keywords=None, + enc_layers=6, + dec_layers=6, + pre_norm=False, + 
dim_feedforward=2048, + hidden_dim=256, + dropout=0.0, + nheads=8, + num_queries=900, + query_dim=4, + num_patterns=0, + num_feature_levels=4, + enc_n_points=4, + dec_n_points=4, + two_stage_type="standard", + two_stage_bbox_embed_share=False, + two_stage_class_embed_share=False, + transformer_activation="relu", + dec_pred_bbox_embed_share=True, + dn_box_noise_scale=1.0, + dn_label_noise_ratio=0.5, + dn_label_coef=1.0, + dn_bbox_coef=1.0, + embed_init_tgt=True, + dn_labelbook_size=2000, + max_text_len=256, + text_encoder_type="bert-base-uncased", + use_text_enhancer=True, + use_fusion_layer=True, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=True, + text_dropout=0.0, + fusion_dropout=0.0, + fusion_droppath=0.1, + sub_sentence_present=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) self.modelname = modelname @@ -115,14 +118,10 @@ def __init__( self.sub_sentence_present = sub_sentence_present @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
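
A minimal usage sketch of the GroundingDinoConfig class reformatted in the hunk above, assuming paddlemix and its paddlenlp dependency are importable under the file layout shown in these paths; the override values below are illustrative only, not a recommended setup.

from paddlemix.models.groundingdino.configuration import GroundingDinoConfig

# Keyword arguments mirror the constructor signature above; anything not passed
# falls back to the defaults shown in the diff (e.g. hidden_dim=256, num_queries=900).
config = GroundingDinoConfig(backbone="swin_T_224_1k", max_text_len=256)

# __init__ forces return_dict to True unless explicitly provided, via
# kwargs.pop("return_dict", True) before calling the PretrainedConfig base class.
print(config.model_type)  # "groundingdino" (class attribute)
print(config.modelname)   # "groundingdino" (set in __init__)

As with the swin config earlier in this diff, from_pretrained() loads the config dict and only warns, rather than fails, when the stored model_type differs from cls.model_type.
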
diff --git a/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py b/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py index 869e9e8314fa9..dd45756efcf85 100644 --- a/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py +++ b/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py @@ -17,5 +17,5 @@ if __name__ == "__main__": setup( name="deformable_detr_ops", - ext_modules=CUDAExtension( - sources=["ms_deformable_attn_op.cc", "ms_deformable_attn_op.cu"]), ) + ext_modules=CUDAExtension(sources=["ms_deformable_attn_op.cc", "ms_deformable_attn_op.cu"]), + ) diff --git a/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py b/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py index f6e4818963d64..3e7739510df32 100644 --- a/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py +++ b/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py @@ -50,22 +50,15 @@ bs, n_heads, c = 2, 8, 8 query_length, n_levels, n_points = 2, 2, 2 spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) -level_start_index = paddle.concat((paddle.to_tensor( - [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) +level_start_index = paddle.concat((paddle.to_tensor([0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) value_length = sum([(H * W).item() for H, W in spatial_shapes]) def get_test_tensors(channels): - value = (paddle.rand( - [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01) - sampling_locations = paddle.rand( - [bs, query_length, n_heads, n_levels, n_points, 2], - dtype=paddle.float32) - attention_weights = (paddle.rand( - [bs, query_length, n_heads, n_levels, n_points], dtype=paddle.float32) + - 1e-5) - attention_weights /= attention_weights.sum(-1, keepdim=True).sum( - -2, keepdim=True) + value = paddle.rand([bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 + sampling_locations = paddle.rand([bs, query_length, n_heads, n_levels, n_points, 2], dtype=paddle.float32) + attention_weights = paddle.rand([bs, query_length, n_heads, n_levels, n_points], dtype=paddle.float32) + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) return [value, sampling_locations, attention_weights] @@ -74,23 +67,31 @@ def get_test_tensors(channels): def check_forward_equal_with_paddle_float(): value, sampling_locations, attention_weights = get_test_tensors(c) - output_paddle = (ms_deform_attn_core_paddle( - value, - spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, ).detach().cpu()) - output_cuda = (ms_deformable_attn( - value, - spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, ).detach().cpu()) - fwdok = paddle.allclose( - output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() + output_paddle = ( + ms_deform_attn_core_paddle( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + ) + .detach() + .cpu() + ) + output_cuda = ( + ms_deformable_attn( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + ) + .detach() + .cpu() + ) + fwdok = paddle.allclose(output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() max_abs_err = (output_cuda - output_paddle).abs().max().item() - max_rel_err = (( - (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()) + max_rel_err = ((output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() print( f"*{fwdok} 
check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}" @@ -101,7 +102,8 @@ def check_gradient_numerical(channels=4): ( value_paddle, sampling_locations_paddle, - attention_weights_paddle, ) = get_test_tensors(channels) + attention_weights_paddle, + ) = get_test_tensors(channels) value_paddle.stop_gradient = False sampling_locations_paddle.stop_gradient = False attention_weights_paddle.stop_gradient = False @@ -118,7 +120,8 @@ def check_gradient_numerical(channels=4): spatial_shapes, level_start_index, sampling_locations_paddle, - attention_weights_paddle, ) + attention_weights_paddle, + ) output_paddle.sum().backward() output_cuda = ms_deformable_attn( @@ -126,25 +129,22 @@ def check_gradient_numerical(channels=4): spatial_shapes, level_start_index, sampling_locations_cuda, - attention_weights_cuda, ) + attention_weights_cuda, + ) output_cuda.sum().backward() - res = paddle.allclose( - value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() + res = paddle.allclose(value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f"*tensor1 {res} check_gradient_numerical(D={channels})") res = paddle.allclose( sampling_locations_paddle.grad, sampling_locations_cuda.grad, rtol=1e-2, - atol=1e-3, ).item() + atol=1e-3, + ).item() print(f"*tensor2 {res} check_gradient_numerical(D={channels})") - res = paddle.allclose( - attention_weights_paddle.grad, - attention_weights_cuda.grad, - rtol=1e-2, - atol=1e-3).item() + res = paddle.allclose(attention_weights_paddle.grad, attention_weights_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f"*tensor3 {res} check_gradient_numerical(D={channels})") diff --git a/paddlemix/models/groundingdino/fuse_modules.py b/paddlemix/models/groundingdino/fuse_modules.py index f395f060c7f94..7940a81a8fcdf 100644 --- a/paddlemix/models/groundingdino/fuse_modules.py +++ b/paddlemix/models/groundingdino/fuse_modules.py @@ -58,11 +58,7 @@ def l2norm(X, dim, eps=1e-8): return X -def func_attention(query, - context, - smooth=1, - raw_feature_norm="softmax", - eps=1e-8): +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): """ query: (n_context, queryL, d) context: (n_context, sourceL, d) @@ -112,13 +108,7 @@ def func_attention(query, class BiMultiHeadAttention(nn.Layer): - def __init__(self, - v_dim, - l_dim, - embed_dim, - num_heads, - dropout=0.1, - cfg=None): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): super(BiMultiHeadAttention, self).__init__() self.embed_dim = embed_dim @@ -130,7 +120,7 @@ def __init__(self, assert ( self.head_dim * self.num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
- self.scale = self.head_dim**(-0.5) + self.scale = self.head_dim ** (-0.5) self.dropout = dropout self.v_proj = nn.Linear(self.v_dim, self.embed_dim) @@ -148,9 +138,7 @@ def __init__(self, self._reset_parameters() def _shape(self, tensor, seq_len, bsz): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def _reset_parameters(self): xavier_uniform_(self.v_proj.weight) @@ -187,16 +175,13 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): value_l_states = self._shape(self.values_l_proj(l), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, - bsz).reshape(proj_shape) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) key_states = key_states.reshape(proj_shape) value_v_states = value_v_states.reshape(proj_shape) value_l_states = value_l_states.reshape(proj_shape) src_len = key_states.shape[1] - attn_weights = paddle.bmm( - query_states, - key_states.transpose([0, 2, 1])) # bs*nhead, nimg, ntxt + attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1])) # bs*nhead, nimg, ntxt if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: raise ValueError( @@ -216,8 +201,7 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose([0, 2, 1]) - attn_weights_l = attn_weights_T - paddle.max( - attn_weights_T, axis=-1, keepdim=True) + attn_weights_l = attn_weights_T - paddle.max(attn_weights_T, axis=-1, keepdim=True) if self.clamp_min_for_underflow: attn_weights_l = paddle.clip( attn_weights_l, min=-50000 @@ -230,53 +214,43 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): # mask vison for language if attention_mask_v is not None: - attention_mask_v = (attention_mask_v[:, None, None, :] - .cast(paddle.float32) - .tile([1, self.num_heads, 1, 1]).flatten(0, 1)) - attn_weights_l = masked_fill(attn_weights_l, - attention_mask_v == 1.0, float("-inf")) + attention_mask_v = ( + attention_mask_v[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights_l = masked_fill(attn_weights_l, attention_mask_v == 1.0, float("-inf")) attn_weights_l = F.softmax(attn_weights_l, axis=-1) # mask language for vision if attention_mask_l is not None: - attention_mask_l = (attention_mask_l[:, None, None, :] - .cast(paddle.float32) - .tile([1, self.num_heads, 1, 1]).flatten(0, 1)) - attn_weights = masked_fill(attn_weights, attention_mask_l == 1.0, - float("-inf")) + attention_mask_l = ( + attention_mask_l[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights = masked_fill(attn_weights, attention_mask_l == 1.0, float("-inf")) attn_weights_v = F.softmax(attn_weights, axis=-1) - attn_probs_v = F.dropout( - attn_weights_v, p=self.dropout, training=self.training) - attn_probs_l = F.dropout( - attn_weights_l, p=self.dropout, training=self.training) + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) attn_output_v = paddle.bmm(attn_probs_v, value_l_states) attn_output_l = paddle.bmm(attn_probs_l, value_v_states) - if attn_output_v.shape != [ - bsz * self.num_heads, tgt_len, self.head_dim - ]: + if 
attn_output_v.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: raise ValueError( f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.shape}" ) - if attn_output_l.shape != [ - bsz * self.num_heads, src_len, self.head_dim - ]: + if attn_output_l.shape != [bsz * self.num_heads, src_len, self.head_dim]: raise ValueError( f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.shape}" ) - attn_output_v = attn_output_v.reshape( - [bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output_v = attn_output_v.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) attn_output_v = attn_output_v.transpose([0, 2, 1, 3]) attn_output_v = attn_output_v.reshape([bsz, tgt_len, self.embed_dim]) - attn_output_l = attn_output_l.reshape( - [bsz, self.num_heads, src_len, self.head_dim]) + attn_output_l = attn_output_l.reshape([bsz, self.num_heads, src_len, self.head_dim]) attn_output_l = attn_output_l.transpose([0, 2, 1, 3]) attn_output_l = attn_output_l.reshape([bsz, src_len, self.embed_dim]) @@ -289,15 +263,16 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): # Bi-Direction MHA (text->image, image->text) class BiAttentionBlock(nn.Layer): def __init__( - self, - v_dim, - l_dim, - embed_dim, - num_heads, - dropout=0.1, - drop_path=0.0, - init_values=1e-4, - cfg=None, ): + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): """ Inputs: embed_dim - Dimensionality of input and attention feature vectors @@ -316,26 +291,24 @@ def __init__( l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, - dropout=dropout, ) + dropout=dropout, + ) # add layer scale for training stability - self.drop_path = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.gamma_v = self.create_parameter( shape=[v_dim], - attr=paddle.ParamAttr(initializer=Constant(init_values)), ) + attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) self.gamma_l = self.create_parameter( shape=[l_dim], - attr=paddle.ParamAttr(initializer=Constant(init_values)), ) + attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): v = self.layer_norm_v(v) l = self.layer_norm_l(l) - delta_v, delta_l = self.attn( - v, - l, - attention_mask_v=attention_mask_v, - attention_mask_l=attention_mask_l) + delta_v, delta_l = self.attn(v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l) # v, l = v + delta_v, l + delta_l v = v + self.drop_path(self.gamma_v * delta_v) l = l + self.drop_path(self.gamma_l * delta_l) diff --git a/paddlemix/models/groundingdino/layers.py b/paddlemix/models/groundingdino/layers.py index f8d4c01da9460..7e15936a14b35 100644 --- a/paddlemix/models/groundingdino/layers.py +++ b/paddlemix/models/groundingdino/layers.py @@ -94,13 +94,14 @@ class MultiHeadAttention(nn.Layer): """ def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, ): + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + ): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -112,20 +113,18 @@ def __init__( self.need_weights = need_weights self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == 
self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if self._qkv_same_embed_dim: self.in_proj_weight = self.create_parameter( shape=[embed_dim, 3 * embed_dim], attr=None, dtype=self._dtype, - is_bias=False, ) + is_bias=False, + ) self.in_proj_bias = self.create_parameter( - shape=[3 * embed_dim], - attr=None, - dtype=self._dtype, - is_bias=True) + shape=[3 * embed_dim], attr=None, dtype=self._dtype, is_bias=True + ) else: self.q_proj = nn.Linear(embed_dim, embed_dim) self.k_proj = nn.Linear(self.kdim, embed_dim) @@ -147,15 +146,14 @@ def compute_qkv(self, tensor, index): if self._qkv_same_embed_dim: tensor = F.linear( x=tensor, - weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) - * self.embed_dim], - bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * - self.embed_dim] - if self.in_proj_bias is not None else None, ) + weight=self.in_proj_weight[:, index * self.embed_dim : (index + 1) * self.embed_dim], + bias=self.in_proj_bias[index * self.embed_dim : (index + 1) * self.embed_dim] + if self.in_proj_bias is not None + else None, + ) else: tensor = getattr(self, self._type_list[index])(tensor) - tensor = tensor.reshape( - [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + tensor = tensor.reshape([0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) return tensor def forward(self, query, key=None, value=None, attn_mask=None): @@ -201,12 +199,11 @@ def forward(self, query, key=None, value=None, attn_mask=None): key = query if key is None else key value = query if value is None else value # compute q ,k ,v - q, k, v = (self.compute_qkv(t, i) - for i, t in enumerate([query, key, value])) + q, k, v = (self.compute_qkv(t, i) for i, t in enumerate([query, key, value])) # scale dot product attention product = paddle.matmul(x=q, y=k, transpose_y=True) - scaling = float(self.head_dim)**-0.5 + scaling = float(self.head_dim) ** -0.5 product = product * scaling if attn_mask is not None: @@ -215,11 +212,7 @@ def forward(self, query, key=None, value=None, attn_mask=None): product = product + attn_mask weights = F.softmax(product) if self.dropout: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) @@ -236,10 +229,7 @@ def forward(self, query, key=None, value=None, attn_mask=None): return out if len(outs) == 1 else tuple(outs) -def drop_path(x, - drop_prob: float=0.0, - training: bool=False, - scale_by_keep: bool=True): +def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, @@ -252,21 +242,17 @@ def drop_path(x, if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob - shape = (x.shape[0], ) + (1, ) * ( - x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = paddle.bernoulli( - paddle.full( - shape, keep_prob, dtype=x.dtype)) + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = paddle.bernoulli(paddle.full(shape, keep_prob, dtype=x.dtype)) if keep_prob > 0.0 and scale_by_keep: - random_tensor = paddle.divide(random_tensor, - paddle.to_tensor(keep_prob)) + random_tensor = paddle.divide(random_tensor, paddle.to_tensor(keep_prob)) return x * random_tensor class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - def __init__(self, drop_prob: float=0.0, scale_by_keep: bool=True): + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): super(DropPath, self).__init__() self.drop_prob = drop_prob self.scale_by_keep = scale_by_keep diff --git a/paddlemix/models/groundingdino/modeling.py b/paddlemix/models/groundingdino/modeling.py index 58d778b122f28..90f4f25a2f20d 100644 --- a/paddlemix/models/groundingdino/modeling.py +++ b/paddlemix/models/groundingdino/modeling.py @@ -18,16 +18,12 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle import Tensor -from paddle.nn import Layer -from paddlenlp.transformers import AutoTokenizer, BertModel, RobertaModel -from paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers import BertModel, RobertaModel +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from paddlenlp.utils.initializer import constant_, xavier_uniform_ from .backbone import build_backbone -from .bertwarper import (BertModelWarper, generate_masks_with_special_tokens, - generate_masks_with_special_tokens_and_transfer_map) +from .bertwarper import BertModelWarper from .configuration import GroundingDinoConfig from .transformer import build_transformer from .utils import MLP, ContrastiveEmbed, inverse_sigmoid @@ -75,14 +71,12 @@ def __init__(self, config: GroundingDinoConfig): elif config.text_encoder_type == "roberta-base": self.bert = RobertaModel.from_pretrained(config.text_encoder_type) else: - raise ValueError("Unknown text_encoder_type {}".format( - config.text_encoder_type)) + raise ValueError("Unknown text_encoder_type {}".format(config.text_encoder_type)) self.bert.pooler.dense.weight.stop_gradient = True self.bert.pooler.dense.bias.stop_gradient = True self.bert = BertModelWarper(bert_model=self.bert) - self.feat_map = nn.Linear( - self.bert.config.hidden_size, self.hidden_dim, bias_attr=True) + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias_attr=True) constant_(self.feat_map.bias, 0) xavier_uniform_(self.feat_map.weight) @@ -94,32 +88,29 @@ def __init__(self, config: GroundingDinoConfig): in_channels = self.backbone.num_channels[_] input_proj_list.append( nn.Sequential( - nn.Conv2D( - in_channels, hidden_dim, kernel_size=1), - nn.GroupNorm(32, hidden_dim), )) + nn.Conv2D(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) for _ in range(config.num_feature_levels - num_backbone_outs): input_proj_list.append( nn.Sequential( - nn.Conv2D( - in_channels, - hidden_dim, - kernel_size=3, - stride=2, - 
padding=1), - nn.GroupNorm(32, hidden_dim), )) + nn.Conv2D(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) in_channels = hidden_dim self.input_proj = nn.LayerList(input_proj_list) else: - assert (two_stage_type == "no" - ), "two_stage_type should be no if num_feature_levels=1 !!!" - self.input_proj = nn.LayerList([ - nn.Sequential( - nn.Conv2D( - self.backbone.num_channels[-1], - hidden_dim, - kernel_size=1), - nn.GroupNorm(32, hidden_dim), ) - ]) + # assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + self.input_proj = nn.LayerList( + [ + nn.Sequential( + nn.Conv2D(self.backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) # prepare class & box embed _class_embed = ContrastiveEmbed() @@ -129,17 +120,10 @@ def __init__(self, config: GroundingDinoConfig): constant_(_bbox_embed.layers[-1].bias, 0) if config.dec_pred_bbox_embed_share: - box_embed_layerlist = [ - _bbox_embed for i in range(self.transformer.num_decoder_layers) - ] + box_embed_layerlist = [_bbox_embed for i in range(self.transformer.num_decoder_layers)] else: - box_embed_layerlist = [ - copy.deepcopy(_bbox_embed) - for i in range(self.transformer.num_decoder_layers) - ] - class_embed_layerlist = [ - _class_embed for i in range(self.transformer.num_decoder_layers) - ] + box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(self.transformer.num_decoder_layers)] + class_embed_layerlist = [_class_embed for i in range(self.transformer.num_decoder_layers)] self.bbox_embed = nn.LayerList(box_embed_layerlist) self.class_embed = nn.LayerList(class_embed_layerlist) self.transformer.decoder.bbox_embed = self.bbox_embed @@ -161,8 +145,7 @@ def __init__(self, config: GroundingDinoConfig): assert config.dec_pred_bbox_embed_share self.transformer.enc_out_class_embed = _class_embed else: - self.transformer.enc_out_class_embed = copy.deepcopy( - _class_embed) + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) self.refpoint_embed = None @@ -178,14 +161,15 @@ def init_ref_points(self, use_num_queries): self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) def forward( - self, - x: paddle.Tensor, - m: paddle.Tensor, - input_ids: paddle.Tensor, - attention_mask: paddle.Tensor, - text_self_attention_masks: paddle.Tensor, - position_ids: paddle.Tensor=None, - targets: List=None, ): + self, + x: paddle.Tensor, + m: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: paddle.Tensor, + text_self_attention_masks: paddle.Tensor, + position_ids: paddle.Tensor = None, + targets: List = None, + ): tokenized = { "input_ids": input_ids, @@ -194,10 +178,7 @@ def forward( # extract text embeddings if self.sub_sentence_present: - tokenized_for_encoder = { - k: v - for k, v in tokenized.items() if k != "attention_mask" - } + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids else: @@ -206,28 +187,22 @@ def forward( bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 - encoded_text = self.feat_map( - bert_output["last_hidden_state"]) # bs, 195, d_model - text_token_mask = tokenized["attention_mask"].cast( - paddle.bool) # bs, 195 + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized["attention_mask"].cast(paddle.bool) # bs, 195 # text_token_mask: True for 
nomask, False for mask # text_self_attention_masks: True for nomask, False for mask if encoded_text.shape[1] > self.max_text_len: - encoded_text = encoded_text[:, :self.max_text_len, :] - text_token_mask = text_token_mask[:, :self.max_text_len] - position_ids = position_ids[:, :self.max_text_len] - text_self_attention_masks = text_self_attention_masks[:, :self. - max_text_len, : - self. - max_text_len] + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[:, : self.max_text_len, : self.max_text_len] text_dict = { "encoded_text": encoded_text, # bs, 195, d_model "text_token_mask": text_token_mask, # bs, 195 "position_ids": position_ids, # bs, 195 - "text_self_attention_masks": - text_self_attention_masks, # bs, 195,195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 } features, feat_masks, poss = self.backbone(x, m) @@ -249,40 +224,35 @@ def forward( else: src = self.input_proj[l](srcs[-1]) # m = samples.mask - mask = F.interpolate( - m[None].cast(paddle.float32), - size=src.shape[-2:]).cast(paddle.bool)[0] + mask = F.interpolate(m[None].cast(paddle.float32), size=src.shape[-2:]).cast(paddle.bool)[0] # pos_l = self.backbone[1](NestedTensor(src, mask)).cast(src.dtype) pos_l = self.backbone[1](mask).cast(src.dtype) srcs.append(src) masks.append(mask) poss.append(pos_l) - input_query_bbox = input_query_label = attn_mask = dn_meta = None + # input_query_bbox = input_query_label = attn_mask = dn_meta = None + input_query_bbox = input_query_label = attn_mask = None hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( - srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, - text_dict) + srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict + ) # deformable-detr-like anchor update outputs_coord_list = [] - for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs - ) in enumerate(zip(reference[:-1], self.bbox_embed, hs)): + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): layer_delta_unsig = layer_bbox_embed(layer_hs) - layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid( - layer_ref_sig) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig) layer_outputs_unsig = F.sigmoid(layer_outputs_unsig) outputs_coord_list.append(layer_outputs_unsig) outputs_coord_list = paddle.stack(outputs_coord_list) # output - outputs_class = paddle.stack([ - layer_cls_embed(layer_hs, text_dict) - for layer_cls_embed, layer_hs in zip(self.class_embed, hs) - ]) - - out = { - "pred_logits": outputs_class[-1], - "pred_boxes": outputs_coord_list[-1] - } + outputs_class = paddle.stack( + [layer_cls_embed(layer_hs, text_dict) for layer_cls_embed, layer_hs in zip(self.class_embed, hs)] + ) + + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]} return out diff --git a/paddlemix/models/groundingdino/ms_deform_attn.py b/paddlemix/models/groundingdino/ms_deform_attn.py index 747a7543977fa..5d29c9ea3e20f 100644 --- a/paddlemix/models/groundingdino/ms_deform_attn.py +++ b/paddlemix/models/groundingdino/ms_deform_attn.py @@ -24,17 +24,17 @@ # helpers def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): - raise ValueError("invalid input for _is_power_of_2: {} (type: {})". 
- format(n, type(n))) + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n - 1) == 0) and n != 0 def deformable_attention_core_func( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, ): + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, +): """ Args: value (Tensor): [bs, value_length, n_head, c] @@ -49,44 +49,46 @@ def deformable_attention_core_func( bs, _, n_head, c = value.shape _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape - value_list = value.split( - value_spatial_shapes.prod(1).split(n_levels), axis=1) + value_list = value.split(value_spatial_shapes.prod(1).split(n_levels), axis=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (h, w) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = (value_list[level].flatten(2).transpose([0, 2, 1]) - .reshape([bs * n_head, c, h, w])) + value_l_ = value_list[level].flatten(2).transpose([0, 2, 1]).reshape([bs * n_head, c, h, w]) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = (sampling_grids[:, :, :, level].transpose( - [0, 2, 1, 3, 4]).flatten(0, 1)) + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose([0, 2, 1, 3, 4]).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample( value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", - align_corners=False, ) + align_corners=False, + ) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( - [bs * n_head, 1, Len_q, n_levels * n_points]) - output = ((paddle.stack( - sampling_value_list, axis=-2).flatten(-2) * attention_weights).sum(-1) - .reshape([bs, n_head * c, Len_q])) + [bs * n_head, 1, Len_q, n_levels * n_points] + ) + output = ( + (paddle.stack(sampling_value_list, axis=-2).flatten(-2) * attention_weights) + .sum(-1) + .reshape([bs, n_head * c, Len_q]) + ) return output.transpose([0, 2, 1]) class MSDeformableAttention(nn.Layer): def __init__( - self, - embed_dim=256, - num_heads=8, - num_levels=4, - num_points=4, - lr_mult=0.1, - batch_first=False, ): + self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1, + batch_first=False, + ): """ Multi-Scale Deformable Attention Module """ @@ -98,14 +100,14 @@ def __init__( self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult), ) + bias_attr=ParamAttr(learning_rate=lr_mult), + ) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) @@ -124,16 +126,11 @@ def __init__( def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) - thetas = paddle.arange( - self.num_heads, - dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + thetas = paddle.arange(self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), 
thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) - grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( - [1, self.num_levels, self.num_points, 1]) - scaling = paddle.arange( - 1, self.num_points + 1, - dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile([1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange(1, self.num_points + 1, dtype=paddle.float32).reshape([1, 1, -1, 1]) grid_init *= scaling self.sampling_offsets.bias.set_value(grid_init.flatten()) # attention_weights @@ -146,13 +143,14 @@ def _reset_parameters(self): constant_(self.output_proj.bias) def forward( - self, - query, - reference_points, - value, - value_spatial_shapes, - value_level_start_index, - value_mask=None, ): + self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None, + ): """ Args: query (Tensor): [bs, query_length, C] @@ -182,34 +180,37 @@ def forward( value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2] + ) attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + [bs, Len_q, self.num_heads, self.num_levels * self.num_points] + ) attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + [bs, Len_q, self.num_heads, self.num_levels, self.num_points] + ) if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) + offset_normalizer = value_spatial_shapes.flip([1]).reshape([1, 1, 1, self.num_levels, 1, 2]) sampling_locations = ( - reference_points.reshape([bs, Len_q, 1, self.num_levels, 1, 2]) - + sampling_offsets / offset_normalizer) + reference_points.reshape([bs, Len_q, 1, self.num_levels, 1, 2]) + sampling_offsets / offset_normalizer + ) elif reference_points.shape[-1] == 4: sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) else: raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) + "Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1]) + ) output = self.ms_deformable_attn_core( value, value_spatial_shapes.astype("int64"), value_level_start_index.astype("int64"), sampling_locations, - attention_weights, ) + attention_weights, + ) output = self.output_proj(output) if not self.batch_first: diff --git a/paddlemix/models/groundingdino/transformer.py b/paddlemix/models/groundingdino/transformer.py index dca5d32a59662..3b80821551e13 100644 --- a/paddlemix/models/groundingdino/transformer.py +++ b/paddlemix/models/groundingdino/transformer.py @@ -14,56 +14,62 @@ from typing import Optional -import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.distributed.fleet.utils import recompute -from paddlenlp.utils.initializer import constant_, normal_, xavier_uniform_ +from paddlenlp.utils.initializer import normal_, xavier_uniform_ from .fuse_modules import BiAttentionBlock from .layers import MultiHeadAttention from .ms_deform_attn import MSDeformableAttention as MSDeformAttn from .transformer_vanilla import TransformerEncoderLayer -from .utils import (MLP, _get_activation_fn, _get_clones, - gen_encoder_output_proposals, gen_sineembed_for_position, - get_sine_pos_embed, inverse_sigmoid) +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, + inverse_sigmoid, +) class Transformer(nn.Layer): def __init__( - self, - d_model=256, - nhead=8, - num_queries=300, - num_encoder_layers=6, - num_unicoder_layers=0, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.0, - activation="relu", - normalize_before=False, - return_intermediate_dec=False, - query_dim=4, - num_patterns=0, - # for deformable encoder - num_feature_levels=1, - enc_n_points=4, - dec_n_points=4, - # init query - learnable_tgt_init=False, - # two stage - two_stage_type="no", - embed_init_tgt=False, - # for text - use_text_enhancer=False, - use_fusion_layer=False, - use_checkpoint=False, - use_transformer_ckpt=False, - use_text_cross_attention=False, - text_dropout=0.1, - fusion_dropout=0.1, - fusion_droppath=0.0, ): + self, + d_model=256, + nhead=8, + num_queries=300, + num_encoder_layers=6, + num_unicoder_layers=0, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + query_dim=4, + num_patterns=0, + # for deformable encoder + num_feature_levels=1, + enc_n_points=4, + dec_n_points=4, + # init query + learnable_tgt_init=False, + # two stage + two_stage_type="no", + embed_init_tgt=False, + # for text + use_text_enhancer=False, + use_fusion_layer=False, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=False, + text_dropout=0.1, + fusion_dropout=0.1, + fusion_droppath=0.0, + ): super().__init__() self.num_feature_levels = num_feature_levels self.num_encoder_layers = num_encoder_layers @@ -80,14 +86,16 @@ def __init__( activation, num_feature_levels, nhead, - enc_n_points, ) + enc_n_points, + ) if use_text_enhancer: text_enhance_layer = TransformerEncoderLayer( d_model=d_model, nhead=nhead // 2, dim_feedforward=dim_feedforward // 2, - dropout=text_dropout, ) + dropout=text_dropout, + ) else: text_enhance_layer = None @@ -98,7 +106,8 @@ def __init__( embed_dim=dim_feedforward // 2, num_heads=nhead // 2, dropout=fusion_dropout, - drop_path=fusion_droppath, ) + 
drop_path=fusion_droppath, + ) else: feature_fusion_layer = None @@ -112,7 +121,8 @@ def __init__( text_enhance_layer=text_enhance_layer, feature_fusion_layer=feature_fusion_layer, use_checkpoint=use_checkpoint, - use_transformer_ckpt=use_transformer_ckpt, ) + use_transformer_ckpt=use_transformer_ckpt, + ) # choose decoder layer type decoder_layer = DeformableTransformerDecoderLayer( @@ -123,7 +133,8 @@ def __init__( num_feature_levels, nhead, dec_n_points, - use_text_cross_attention=use_text_cross_attention, ) + use_text_cross_attention=use_text_cross_attention, + ) decoder_norm = nn.LayerNorm(d_model) self.decoder = TransformerDecoder( @@ -133,7 +144,8 @@ def __init__( return_intermediate=return_intermediate_dec, d_model=d_model, query_dim=query_dim, - num_feature_levels=num_feature_levels, ) + num_feature_levels=num_feature_levels, + ) self.d_model = d_model self.nhead = nhead @@ -141,22 +153,19 @@ def __init__( self.num_queries = num_queries # useful for single stage model only self.num_patterns = num_patterns if not isinstance(num_patterns, int): - Warning("num_patterns should be int but {}".format( - type(num_patterns))) + Warning("num_patterns should be int but {}".format(type(num_patterns))) self.num_patterns = 0 if num_feature_levels > 1: if self.num_encoder_layers > 0: - self.level_embed = self.create_parameter( - shape=[num_feature_levels, d_model]) + self.level_embed = self.create_parameter(shape=[num_feature_levels, d_model]) else: self.level_embed = None self.learnable_tgt_init = learnable_tgt_init assert learnable_tgt_init, "why not learnable_tgt_init" self.embed_init_tgt = embed_init_tgt - if (two_stage_type != "no" and embed_init_tgt) or ( - two_stage_type == "no"): + if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"): self.tgt_embed = nn.Embedding(self.num_queries, d_model) normal_(self.tgt_embed.weight) else: @@ -205,14 +214,15 @@ def init_ref_points(self, use_num_queries): self.refpoint_embed = nn.Embedding(use_num_queries, 4) def forward( - self, - srcs, - masks, - refpoint_embed, - pos_embeds, - tgt, - attn_mask=None, - text_dict=None, ): + self, + srcs, + masks, + refpoint_embed, + pos_embeds, + tgt, + attn_mask=None, + text_dict=None, + ): """ Input: - srcs: List of multi features [bs, ci, hi, wi] @@ -227,18 +237,15 @@ def forward( mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] - for lvl, (src, mask, - pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, h, w = src.shape spatial_shapes.append(paddle.to_tensor([h, w])) src = src.flatten(2).transpose([0, 2, 1]) # bs, hw, c - mask = mask.cast(paddle.float32).flatten(1).cast( - paddle.bool) # bs, hw + mask = mask.cast(paddle.float32).flatten(1).cast(paddle.bool) # bs, hw pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) # bs, hw, c if self.num_feature_levels > 1 and self.level_embed is not None: - lvl_pos_embed = pos_embed + self.level_embed[lvl].reshape( - [1, 1, -1]) + lvl_pos_embed = pos_embed + self.level_embed[lvl].reshape([1, 1, -1]) else: lvl_pos_embed = pos_embed lvl_pos_embed_flatten.append(lvl_pos_embed) @@ -246,20 +253,20 @@ def forward( mask_flatten.append(mask) src_flatten = paddle.concat(src_flatten, 1) # bs, \sum{hxw}, c mask_flatten = paddle.concat(mask_flatten, 1) # bs, \sum{hxw} - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, - 1) # bs, \sum{hxw}, c + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c - spatial_shapes = 
paddle.to_tensor( - paddle.stack(spatial_shapes), dtype=paddle.int32) + spatial_shapes = paddle.to_tensor(paddle.stack(spatial_shapes), dtype=paddle.int32) - level_start_index = paddle.concat(( - paddle.zeros( - [1], dtype=spatial_shapes.dtype), - spatial_shapes.prod(1).cumsum(0)[:-1], )) + level_start_index = paddle.concat( + ( + paddle.zeros([1], dtype=spatial_shapes.dtype), + spatial_shapes.prod(1).cumsum(0)[:-1], + ) + ) valid_ratios = paddle.stack([self.get_valid_ratio(m) for m in masks], 1) # two stage - enc_topk_proposals = enc_refpoint_embed = None + # enc_topk_proposals = enc_refpoint_embed = None ######################################################### # Begin Encoder @@ -275,7 +282,8 @@ def forward( text_attention_mask=~text_dict["text_token_mask"], # we ~ the mask . False means use the token; True means pad the token position_ids=text_dict["position_ids"], - text_self_attention_masks=text_dict["text_self_attention_masks"], ) + text_self_attention_masks=text_dict["text_self_attention_masks"], + ) ######################################################### # End Encoder # - memory: bs, \sum{hw}, c @@ -287,16 +295,13 @@ def forward( text_dict["encoded_text"] = memory_text if self.two_stage_type == "standard": - output_memory, output_proposals = gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) + output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) output_memory = self.enc_output_norm(self.enc_output(output_memory)) if text_dict is not None: - enc_outputs_class_unselected = self.enc_out_class_embed( - output_memory, text_dict) + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) else: - enc_outputs_class_unselected = self.enc_out_class_embed( - output_memory) + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) topk_logits = enc_outputs_class_unselected.max(-1) enc_outputs_coord_unselected = ( @@ -306,47 +311,39 @@ def forward( topk_proposals = paddle.topk(topk_logits, topk, axis=1)[1] # bs, nq - topk_ind = topk_proposals.unsqueeze(axis=-1).tile( - repeat_times=[1, 1, 4]) + topk_ind = topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, 4]) # gather boxes refpoint_embed_undetach = paddle.take_along_axis( - arr=enc_outputs_coord_unselected, axis=1, indices=topk_ind) + arr=enc_outputs_coord_unselected, axis=1, indices=topk_ind + ) refpoint_embed_ = refpoint_embed_undetach - init_box_proposal = F.sigmoid( - paddle.take_along_axis( - arr=output_proposals, axis=1, indices=topk_ind)) + init_box_proposal = F.sigmoid(paddle.take_along_axis(arr=output_proposals, axis=1, indices=topk_ind)) tgt_undetach = paddle.take_along_axis( arr=output_memory, axis=1, - indices=topk_proposals.unsqueeze(axis=-1).tile( - repeat_times=[1, 1, self.d_model]), ) + indices=topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, self.d_model]), + ) if self.embed_init_tgt: - tgt_ = (self.tgt_embed.weight[:, None, :].tile([1, bs, 1]) - .transpose([1, 0, 2])) # nq, bs, d_model + tgt_ = self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) # nq, bs, d_model else: tgt_ = tgt_undetach if refpoint_embed is not None: - refpoint_embed = paddle.concat( - [refpoint_embed, refpoint_embed_], axis=1) + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) tgt = paddle.concat([tgt, tgt_], axis=1) else: refpoint_embed, tgt = refpoint_embed_, tgt_ elif self.two_stage_type == "no": - tgt_ = (self.tgt_embed.weight[:, None, :].tile( - [1, bs, 1]).transpose([1, 0, 2])) # 
nq, bs, d_model - refpoint_embed_ = (self.refpoint_embed.weight[:, None, :] - .tile([1, bs, 1]) - .transpose([1, 0, 2])) # nq, bs, 4 + tgt_ = self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) # nq, bs, d_model + refpoint_embed_ = self.refpoint_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) # nq, bs, 4 if refpoint_embed is not None: - refpoint_embed = paddle.concat( - [refpoint_embed, refpoint_embed_], axis=1) + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) tgt = paddle.concat([tgt, tgt_], axis=1) else: refpoint_embed, tgt = refpoint_embed_, tgt_ @@ -355,14 +352,14 @@ def forward( tgt_embed = tgt.tile([1, self.num_patterns, 1]) refpoint_embed = refpoint_embed.tile([1, self.num_patterns, 1]) tgt_pat = self.patterns.weight[None, :, :].repeat_interleave( - self.num_queries, 1) # 1, n_q*n_pat, d_model + self.num_queries, 1 + ) # 1, n_q*n_pat, d_model tgt = tgt_embed + tgt_pat init_box_proposal = F.sigmoid(refpoint_embed_) else: - raise NotImplementedError("unknown two_stage_type {}".format( - self.two_stage_type)) + raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) ######################################################### # End preparing tgt # - tgt: bs, NQ, d_model @@ -416,16 +413,17 @@ def forward( class TransformerEncoder(nn.Layer): def __init__( - self, - encoder_layer, - num_layers, - d_model=256, - num_queries=300, - enc_layer_share=False, - text_enhance_layer=None, - feature_fusion_layer=None, - use_checkpoint=False, - use_transformer_ckpt=False, ): + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): """_summary_ Args: @@ -443,17 +441,12 @@ def __init__( self.text_layers = [] self.fusion_layers = [] if num_layers > 0: - self.layers = _get_clones( - encoder_layer, num_layers, layer_share=enc_layer_share) + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) if text_enhance_layer is not None: - self.text_layers = _get_clones( - text_enhance_layer, num_layers, layer_share=enc_layer_share) + self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share) if feature_fusion_layer is not None: - self.fusion_layers = _get_clones( - feature_fusion_layer, - num_layers, - layer_share=enc_layer_share) + self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share) else: self.layers = [] del encoder_layer @@ -479,14 +472,11 @@ def get_reference_points(spatial_shapes, valid_ratios): for lvl, (H_, W_) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( - paddle.linspace( - 0.5, H_ - 0.5, H_, dtype=paddle.float32), - paddle.linspace( - 0.5, W_ - 0.5, W_, dtype=paddle.float32), ) - ref_y = ref_y.reshape([-1])[None] / (valid_ratios[:, None, lvl, 1] * - H_) - ref_x = ref_x.reshape([-1])[None] / (valid_ratios[:, None, lvl, 0] * - W_) + paddle.linspace(0.5, H_ - 0.5, H_, dtype=paddle.float32), + paddle.linspace(0.5, W_ - 0.5, W_, dtype=paddle.float32), + ) + ref_y = ref_y.reshape([-1])[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape([-1])[None] / (valid_ratios[:, None, lvl, 0] * W_) ref = paddle.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = paddle.concat(reference_points_list, 1) @@ -494,20 +484,21 @@ def get_reference_points(spatial_shapes, valid_ratios): return reference_points def forward( - self, 
- # for images - src: paddle.Tensor, - pos: paddle.Tensor, - spatial_shapes: paddle.Tensor, - level_start_index: paddle.Tensor, - valid_ratios: paddle.Tensor, - key_padding_mask: paddle.Tensor, - # for texts - memory_text: paddle.Tensor=None, - text_attention_mask: paddle.Tensor=None, - pos_text: paddle.Tensor=None, - text_self_attention_masks: paddle.Tensor=None, - position_ids: paddle.Tensor=None, ): + self, + # for images + src: paddle.Tensor, + pos: paddle.Tensor, + spatial_shapes: paddle.Tensor, + level_start_index: paddle.Tensor, + valid_ratios: paddle.Tensor, + key_padding_mask: paddle.Tensor, + # for texts + memory_text: paddle.Tensor = None, + text_attention_mask: paddle.Tensor = None, + pos_text: paddle.Tensor = None, + text_self_attention_masks: paddle.Tensor = None, + position_ids: paddle.Tensor = None, + ): """ Input: - src: [bs, sum(hi*wi), 256] @@ -533,22 +524,16 @@ def forward( # preparation and reshape if self.num_layers > 0: - reference_points = self.get_reference_points(spatial_shapes, - valid_ratios) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios) if self.text_layers: # generate pos_text bs, n_text, text_dim = memory_text.shape if pos_text is None and position_ids is None: - pos_text = (paddle.arange(n_text).cast(paddle.float32) - .unsqueeze(0).unsqueeze(-1).tile([bs, 1, 1])) - pos_text = get_sine_pos_embed( - pos_text, num_pos_feats=256, exchange_xy=False) + pos_text = paddle.arange(n_text).cast(paddle.float32).unsqueeze(0).unsqueeze(-1).tile([bs, 1, 1]) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) if position_ids is not None: - pos_text = get_sine_pos_embed( - position_ids[..., None], - num_pos_feats=256, - exchange_xy=False) + pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False) # main process for layer_id, layer in enumerate(self.layers): @@ -560,20 +545,23 @@ def forward( memory_text, key_padding_mask, text_attention_mask, - **{"preserve_rng_state": True}, ) + **{"preserve_rng_state": True}, + ) else: output, memory_text = self.fusion_layers[layer_id]( v=output, l=memory_text, attention_mask_v=key_padding_mask, - attention_mask_l=text_attention_mask, ) + attention_mask_l=text_attention_mask, + ) if self.text_layers: memory_text = self.text_layers[layer_id]( src=memory_text, src_mask=text_self_attention_masks, # note we use ~ for mask here src_key_padding_mask=text_attention_mask, - pos=(pos_text if pos_text is not None else None), ) + pos=(pos_text if pos_text is not None else None), + ) # main process if self.use_transformer_ckpt: @@ -585,7 +573,8 @@ def forward( spatial_shapes, level_start_index, key_padding_mask, - **{"preserve_rng_state": True}, ) + **{"preserve_rng_state": True}, + ) else: output = layer( src=output, @@ -593,21 +582,23 @@ def forward( reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - key_padding_mask=key_padding_mask, ) + key_padding_mask=key_padding_mask, + ) return output, memory_text class TransformerDecoder(nn.Layer): def __init__( - self, - decoder_layer, - num_layers, - norm=None, - return_intermediate=False, - d_model=256, - query_dim=4, - num_feature_levels=1, ): + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): super().__init__() if num_layers > 0: self.layers = _get_clones(decoder_layer, num_layers) @@ -618,8 +609,7 @@ def __init__( self.return_intermediate = return_intermediate assert 
return_intermediate, "support return_intermediate only" self.query_dim = query_dim - assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format( - query_dim) + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) self.num_feature_levels = num_feature_levels self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) @@ -634,23 +624,23 @@ def __init__( self.ref_anchor_head = None def forward( - self, - tgt, - memory, - tgt_mask: Optional[paddle.Tensor]=None, - memory_mask: Optional[paddle.Tensor]=None, - tgt_key_padding_mask: Optional[paddle.Tensor]=None, - memory_key_padding_mask: Optional[paddle.Tensor]=None, - pos: Optional[paddle.Tensor]=None, - refpoints_unsigmoid: Optional[ - paddle.Tensor]=None, # num_queries, bs, 2 - # for memory - level_start_index: Optional[paddle.Tensor]=None, # num_levels - spatial_shapes: Optional[paddle.Tensor]=None, # bs, num_levels, 2 - valid_ratios: Optional[paddle.Tensor]=None, - # for text - memory_text: Optional[paddle.Tensor]=None, - text_attention_mask: Optional[paddle.Tensor]=None, ): + self, + tgt, + memory, + tgt_mask: Optional[paddle.Tensor] = None, + memory_mask: Optional[paddle.Tensor] = None, + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + memory_key_padding_mask: Optional[paddle.Tensor] = None, + pos: Optional[paddle.Tensor] = None, + refpoints_unsigmoid: Optional[paddle.Tensor] = None, # num_queries, bs, 2 + # for memory + level_start_index: Optional[paddle.Tensor] = None, # num_levels + spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[paddle.Tensor] = None, + # for text + memory_text: Optional[paddle.Tensor] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): """ Input: - tgt: nq, bs, d_model @@ -669,20 +659,16 @@ def forward( if reference_points.shape[-1] == 4: reference_points_input = ( - reference_points[:, :, None] * - paddle.concat([valid_ratios, valid_ratios], -1)[None, :] + reference_points[:, :, None] * paddle.concat([valid_ratios, valid_ratios], -1)[None, :] ) # nq, bs, nlevel, 4 else: assert reference_points.shape[-1] == 2 - reference_points_input = (reference_points[:, :, None] * - valid_ratios[None, :]) - query_sine_embed = gen_sineembed_for_position( - reference_points_input[:, :, 0, :]) # nq, bs, 256*2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # nq, bs, 256*2 # conditional query raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 - pos_scale = self.query_scale( - output) if self.query_scale is not None else 1 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 query_pos = pos_scale * raw_query_pos # main process @@ -700,10 +686,10 @@ def forward( memory_spatial_shapes=spatial_shapes, memory_pos=pos, self_attn_mask=tgt_mask, - cross_attn_mask=memory_mask, ) + cross_attn_mask=memory_mask, + ) - if (output.isnan().any() | - output.isinf().any()) and paddle.in_dynamic_mode(): + if (output.isnan().any() | output.isinf().any()) and paddle.in_dynamic_mode(): print(f"output layer_id {layer_id} is nan") try: num_nan = output.isnan().sum().item() @@ -734,14 +720,15 @@ def forward( class DeformableTransformerEncoderLayer(nn.Layer): def __init__( - self, - d_model=256, - d_ffn=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_heads=8, - n_points=4, ): + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + 
n_points=4, + ): super().__init__() # self attention @@ -750,7 +737,8 @@ def __init__( num_levels=n_levels, num_heads=n_heads, num_points=n_points, - batch_first=True, ) + batch_first=True, + ) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) @@ -773,13 +761,14 @@ def forward_ffn(self, src): return src def forward( - self, - src, - pos, - reference_points, - spatial_shapes, - level_start_index, - key_padding_mask=None, ): + self, + src, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask=None, + ): src2 = self.self_attn( query=self.with_pos_embed(src, pos), @@ -787,7 +776,8 @@ def forward( value=src, value_spatial_shapes=spatial_shapes, value_level_start_index=level_start_index, - value_mask=key_padding_mask, ) + value_mask=key_padding_mask, + ) src = src + self.dropout1(src2) src = self.norm1(src) @@ -799,16 +789,17 @@ def forward( class DeformableTransformerDecoderLayer(nn.Layer): def __init__( - self, - d_model=256, - d_ffn=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_heads=8, - n_points=4, - use_text_feat_guide=False, - use_text_cross_attention=False, ): + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): super().__init__() # cross attention @@ -817,15 +808,15 @@ def __init__( num_levels=n_levels, num_heads=n_heads, num_points=n_points, - batch_first=True, ) + batch_first=True, + ) self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() self.norm1 = nn.LayerNorm(d_model) # cross attention text if use_text_cross_attention: self.ca_text = MultiHeadAttention(d_model, n_heads, dropout=dropout) - self.catext_dropout = nn.Dropout( - dropout) if dropout > 0 else nn.Identity() + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() self.catext_norm = nn.LayerNorm(d_model) # self attention @@ -835,8 +826,7 @@ def __init__( # ffn self.linear1 = nn.Linear(d_model, d_ffn) - self.activation = _get_activation_fn( - activation, d_model=d_ffn, batch_dim=1) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() self.linear2 = nn.Linear(d_ffn, d_model) self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() @@ -864,30 +854,24 @@ def forward_ffn(self, tgt): return tgt def forward( - self, - # for tgt - tgt: Optional[paddle.Tensor], # nq, bs, d_model - tgt_query_pos: Optional[ - paddle.Tensor]=None, # pos for query. MLP(Sine(pos)) - tgt_query_sine_embed: Optional[ - paddle.Tensor]=None, # pos for query. 
Sine(pos) - tgt_key_padding_mask: Optional[paddle.Tensor]=None, - tgt_reference_points: Optional[paddle.Tensor]=None, # nq, bs, 4 - memory_text: Optional[paddle.Tensor]=None, # bs, num_token, d_model - text_attention_mask: Optional[paddle.Tensor]=None, # bs, num_token - # for memory - memory: Optional[paddle.Tensor]=None, # hw, bs, d_model - memory_key_padding_mask: Optional[paddle.Tensor]=None, - memory_level_start_index: Optional[ - paddle.Tensor]=None, # num_levels - memory_spatial_shapes: Optional[ - paddle.Tensor]=None, # bs, num_levels, 2 - memory_pos: Optional[paddle.Tensor]=None, # pos for memory - # sa - self_attn_mask: Optional[ - paddle.Tensor]=None, # mask used for self-attention - cross_attn_mask: Optional[ - paddle.Tensor]=None, # mask used for cross-attention + self, + # for tgt + tgt: Optional[paddle.Tensor], # nq, bs, d_model + tgt_query_pos: Optional[paddle.Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[paddle.Tensor] = None, # pos for query. Sine(pos) + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + tgt_reference_points: Optional[paddle.Tensor] = None, # nq, bs, 4 + memory_text: Optional[paddle.Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[paddle.Tensor] = None, # bs, num_token + # for memory + memory: Optional[paddle.Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[paddle.Tensor] = None, + memory_level_start_index: Optional[paddle.Tensor] = None, # num_levels + memory_spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[paddle.Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[paddle.Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[paddle.Tensor] = None, # mask used for cross-attention ): """ Input: @@ -904,8 +888,8 @@ def forward( q, k, tgt, - attn_mask=self_attn_mask - if self_attn_mask is None else ~self_attn_mask, )[0] + attn_mask=self_attn_mask if self_attn_mask is None else ~self_attn_mask, + )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) @@ -914,7 +898,8 @@ def forward( self.with_pos_embed(tgt, tgt_query_pos), memory_text, memory_text, - attn_mask=~text_attention_mask, )[0] + attn_mask=~text_attention_mask, + )[0] tgt = tgt + self.catext_dropout(tgt2) tgt = self.catext_norm(tgt) @@ -924,7 +909,8 @@ def forward( value=memory, value_spatial_shapes=memory_spatial_shapes, value_level_start_index=memory_level_start_index, - value_mask=memory_key_padding_mask, ) + value_mask=memory_key_padding_mask, + ) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) @@ -962,4 +948,5 @@ def build_transformer(args): use_text_cross_attention=args.use_text_cross_attention, text_dropout=args.text_dropout, fusion_dropout=args.fusion_dropout, - fusion_droppath=args.fusion_droppath, ) + fusion_droppath=args.fusion_droppath, + ) diff --git a/paddlemix/models/groundingdino/transformer_vanilla.py b/paddlemix/models/groundingdino/transformer_vanilla.py index 858dc6cd87395..7dacba17b74a7 100644 --- a/paddlemix/models/groundingdino/transformer_vanilla.py +++ b/paddlemix/models/groundingdino/transformer_vanilla.py @@ -15,22 +15,14 @@ from typing import Optional import paddle -import paddle.nn.functional as F from paddle import Tensor, nn from .layers import MultiHeadAttention -from .utils import (MLP, _get_activation_fn, _get_clones, - gen_encoder_output_proposals, gen_sineembed_for_position, - sigmoid_focal_loss) +from .utils import _get_activation_fn, _get_clones class TextTransformer(nn.Layer): - def 
__init__(self, - num_layers, - d_model=256, - nheads=8, - dim_feedforward=2048, - dropout=0.1): + def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): super().__init__() self.num_layers = num_layers self.d_model = d_model @@ -42,12 +34,11 @@ def __init__(self, d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, - dropout=dropout, ) + dropout=dropout, + ) self.layers = _get_clones(single_encoder_layer, num_layers) - def forward(self, - memory_text: paddle.Tensor, - text_attention_mask: paddle.Tensor): + def forward(self, memory_text: paddle.Tensor, text_attention_mask: paddle.Tensor): """ Args: @@ -74,13 +65,14 @@ def forward(self, class TransformerEncoderLayer(nn.Layer): def __init__( - self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, ): + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): super().__init__() self.self_attn = MultiHeadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model @@ -101,11 +93,12 @@ def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward( - self, - src, - src_mask: Optional[Tensor]=None, - src_key_padding_mask: Optional[Tensor]=None, - pos: Optional[Tensor]=None, ): + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): # repeat attn mask if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: # bs, num_q, num_k diff --git a/paddlemix/models/groundingdino/utils.py b/paddlemix/models/groundingdino/utils.py index c2c4115e02561..42984c8ea91c0 100644 --- a/paddlemix/models/groundingdino/utils.py +++ b/paddlemix/models/groundingdino/utils.py @@ -41,10 +41,11 @@ def _get_clones(module, N, layer_share=False): def get_sine_pos_embed( - pos_tensor: paddle.Tensor, - num_pos_feats: int=128, - temperature: int=10000, - exchange_xy: bool=True, ): + pos_tensor: paddle.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): """generate sine position embedding from a position tensor Args: pos_tensor (paddle.Tensor): shape: [..., n]. 
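[Reviewer note, not part of the diff] The hunks around this point only reformat get_sine_pos_embed; as a quick reference, the embedding it computes is an interleaved sine/cosine code per coordinate. The stand-alone sketch below reproduces that computation for a single [n, 1] coordinate column; the function name and the plain-Paddle ops are illustrative assumptions, not the repository's API.

    import math
    import paddle

    def sine_embed_1d(coord: paddle.Tensor, num_pos_feats: int = 128, temperature: float = 10000.0) -> paddle.Tensor:
        # coord: [n, 1] normalized position -> [n, num_pos_feats] embedding
        scale = 2 * math.pi
        dim_t = paddle.arange(num_pos_feats, dtype="float32")
        # channel pairs share one frequency: temperature ** (2 * floor(i / 2) / num_pos_feats)
        dim_t = temperature ** (2.0 * paddle.floor(dim_t / 2) / num_pos_feats)
        x = coord * scale / dim_t  # broadcasts to [n, num_pos_feats]
        # even channels take sin, odd channels take cos, then re-interleave
        return paddle.stack((x[:, 0::2].sin(), x[:, 1::2].cos()), axis=2).flatten(1)

Applied once per box coordinate and concatenated, this is the same construction gen_sineembed_for_position and get_sine_pos_embed use for the decoder's query embeddings.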
@@ -57,20 +58,14 @@ def get_sine_pos_embed( """ scale = 2 * math.pi dim_t = paddle.arange(num_pos_feats) - dim_t = temperature**( - 2.0 * paddle.floor_divide(dim_t, paddle.to_tensor(2)) / num_pos_feats) + dim_t = temperature ** (2.0 * paddle.floor_divide(dim_t, paddle.to_tensor(2)) / num_pos_feats) def sine_func(x: paddle.Tensor): sin_x = x * scale / dim_t - sin_x = paddle.stack( - (sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), axis=3).flatten(2) + sin_x = paddle.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), axis=3).flatten(2) return sin_x - pos_res = [ - sine_func(x) - for x in pos_tensor.split( - [1] * pos_tensor.shape[-1], axis=-1) - ] + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], axis=-1)] if exchange_xy: pos_res[0], pos_res[1] = pos_res[1], pos_res[0] pos_res = paddle.concat(pos_res, axis=-1) @@ -78,10 +73,11 @@ def sine_func(x: paddle.Tensor): def gen_encoder_output_proposals( - memory: paddle.Tensor, - memory_padding_mask: paddle.Tensor, - spatial_shapes: paddle.Tensor, - learnedwh=None, ): + memory: paddle.Tensor, + memory_padding_mask: paddle.Tensor, + spatial_shapes: paddle.Tensor, + learnedwh=None, +): """ Input: - memory: bs, \sum{hw}, d_model @@ -96,23 +92,19 @@ def gen_encoder_output_proposals( proposals = [] _cur = 0 for lvl, (H_, W_) in enumerate(spatial_shapes): - mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].reshape( - [N_, H_, W_, 1]) + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].reshape([N_, H_, W_, 1]) valid_H = paddle.sum(~mask_flatten_[:, :, 0, 0], 1) valid_W = paddle.sum(~mask_flatten_[:, 0, :, 0], 1) # import ipdb; ipdb.set_trace() grid_y, grid_x = paddle.meshgrid( - paddle.linspace( - 0, H_ - 1, H_, dtype=paddle.float32), - paddle.linspace( - 0, W_ - 1, W_, dtype=paddle.float32), ) - grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], - -1) # H_, W_, 2 - - scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], - 1).reshape([N_, 1, 1, 2]) + paddle.linspace(0, H_ - 1, H_, dtype=paddle.float32), + paddle.linspace(0, W_ - 1, W_, dtype=paddle.float32), + ) + grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N_, 1, 1, 2]) grid = (grid.unsqueeze(0).tile([N_, 1, 1, 1]) + 0.5) / scale if learnedwh is not None: @@ -126,33 +118,21 @@ def gen_encoder_output_proposals( _cur += H_ * W_ output_proposals = paddle.concat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & - (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = paddle.log(output_proposals / - (1 - output_proposals)) # unsigmoid - output_proposals = masked_fill(output_proposals, - memory_padding_mask.unsqueeze(-1), - float("inf")) - output_proposals = masked_fill(output_proposals, ~output_proposals_valid, - float("inf")) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = paddle.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = masked_fill(output_proposals, memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = masked_fill(output_proposals, ~output_proposals_valid, float("inf")) output_memory = memory - output_memory = masked_fill(output_memory, - memory_padding_mask.unsqueeze(-1), float(0)) - output_memory = masked_fill(output_memory, ~output_proposals_valid, - float(0)) + output_memory = masked_fill(output_memory, 
memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) return output_memory, output_proposals class RandomBoxPerturber: - def __init__(self, - x_noise_scale=0.2, - y_noise_scale=0.2, - w_noise_scale=0.2, - h_noise_scale=0.2) -> None: - self.noise_scale = paddle.to_tensor( - [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) + def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None: + self.noise_scale = paddle.to_tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) def __call__(self, refanchors: paddle.Tensor) -> paddle.Tensor: nq, bs, query_dim = refanchors.shape @@ -165,12 +145,13 @@ def __call__(self, refanchors: paddle.Tensor) -> paddle.Tensor: def sigmoid_focal_loss( - inputs, - targets, - num_boxes, - alpha: float=0.25, - gamma: float=2, - no_reduction=False, ): + inputs, + targets, + num_boxes, + alpha: float = 0.25, + gamma: float = 2, + no_reduction=False, +): """ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. Args: @@ -187,10 +168,9 @@ def sigmoid_focal_loss( Loss tensor """ prob = inputs.sigmoid() - ce_loss = F.binary_cross_entropy_with_logits( - inputs, targets, reduction="none") + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t)**gamma) + loss = ce_loss * ((1 - p_t) ** gamma) if alpha >= 0: alpha_t = alpha * targets + (1 - alpha) * (1 - targets) @@ -209,8 +189,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = nn.LayerList( - nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.layers = nn.LayerList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): @@ -238,34 +217,27 @@ def gen_sineembed_for_position(pos_tensor): scale = 2 * math.pi dim_t = paddle.arange(128) - dim_t = 10000**(2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) + dim_t = 10000 ** (2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) x_embed = pos_tensor[:, :, 0] * scale y_embed = pos_tensor[:, :, 1] * scale pos_x = x_embed[:, :, None] / dim_t pos_y = y_embed[:, :, None] / dim_t - pos_x = paddle.stack( - (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), axis=3).flatten(2) - pos_y = paddle.stack( - (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), axis=3).flatten(2) + pos_x = paddle.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), axis=3).flatten(2) + pos_y = paddle.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), axis=3).flatten(2) if pos_tensor.shape[-1] == 2: pos = paddle.concat((pos_y, pos_x), aixs=2) elif pos_tensor.shape[-1] == 4: w_embed = pos_tensor[:, :, 2] * scale pos_w = w_embed[:, :, None] / dim_t - pos_w = paddle.stack( - (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), - axis=3).flatten(2) + pos_w = paddle.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), axis=3).flatten(2) h_embed = pos_tensor[:, :, 3] * scale pos_h = h_embed[:, :, None] / dim_t - pos_h = paddle.stack( - (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), - axis=3).flatten(2) + pos_h = paddle.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), axis=3).flatten(2) pos = paddle.concat((pos_y, pos_x, pos_w, pos_h), axis=2) else: - raise ValueError("Unknown pos_tensor 
shape(-1):{}".format( - pos_tensor.shape[-1])) + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.shape[-1])) return pos @@ -297,12 +269,11 @@ def forward(self, x, text_dict): y = text_dict["encoded_text"] text_token_mask = text_dict["text_token_mask"] - res = x @y.transpose([0, 2, 1]) + res = x @ y.transpose([0, 2, 1]) masked_fill(res, ~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len - new_res = paddle.full((*res.shape[:-1], self.max_text_len), - float("-inf")) - new_res[..., :res.shape[-1]] = res + new_res = paddle.full((*res.shape[:-1], self.max_text_len), float("-inf")) + new_res[..., : res.shape[-1]] = res return new_res diff --git a/paddlemix/models/imagebind/configuration.py b/paddlemix/models/imagebind/configuration.py index 2ce6a32933571..04bd63b46cc2a 100644 --- a/paddlemix/models/imagebind/configuration.py +++ b/paddlemix/models/imagebind/configuration.py @@ -16,8 +16,7 @@ import os from typing import Union -from paddlenlp.transformers.clip.configuration import (CLIPTextConfig, - CLIPVisionConfig) +from paddlenlp.transformers.clip.configuration import CLIPTextConfig, CLIPVisionConfig from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlemix.utils.log import logger @@ -35,28 +34,23 @@ class ImageBindVisionConfig(CLIPVisionConfig): model_type = "imagebind_vision_model" def __init__( - self, - **kwargs, ): - kwargs["return_dict"] = kwargs.pop("return_dict", True) - super().__init__(**kwargs) - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, ): + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -81,29 +75,31 @@ class ImageBindTextConfig(CLIPTextConfig): model_type = "imagebind_text_model" def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - **kwargs, ): + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, - **kwargs, ) + **kwargs, + ) self.vocab_size = vocab_size self.hidden_size = hidden_size @@ -125,24 +121,21 @@ class ImageBindAudioConfig(PretrainedConfig): model_type = "imagebind_audio_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = 
kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["audio_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -156,24 +149,21 @@ class ImageBindDepthConfig(PretrainedConfig): model_type = "imagebind_depth_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["depth_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -187,24 +177,21 @@ class ImageBindThermalConfig(PretrainedConfig): model_type = "imagebind_thermal_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["thermal_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." @@ -218,24 +205,21 @@ class ImageBindIMUConfig(PretrainedConfig): model_type = "imagebind_imu_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["imu_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -250,16 +234,17 @@ class ImageBindConfig(PretrainedConfig): is_composition = True def __init__( - self, - text_config=None, - vision_config=None, - audio_config=None, - depth_config=None, - thermal_config=None, - imu_config=None, - projection_dim=512, - logit_scale_init_value=2.6592, - **kwargs, ): + self, + text_config=None, + vision_config=None, + audio_config=None, + depth_config=None, + thermal_config=None, + imu_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -285,39 +270,27 @@ def __init__( if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the ImageBindTextConfig with default values." - ) + logger.info("text_config is None. Initializing the ImageBindTextConfig with default values.") if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the ImageBindVisionConfig with default values." - ) + logger.info("vision_config is None. initializing the ImageBindVisionConfig with default values.") if audio_config is None: audio_config = {} - logger.info( - "audio_config is None. initializing the ImageBindAudioConfig with default values." - ) + logger.info("audio_config is None. initializing the ImageBindAudioConfig with default values.") if depth_config is None: depth_config = {} - logger.info( - "depth_config is None. initializing the ImageBindDepthConfig with default values." - ) + logger.info("depth_config is None. initializing the ImageBindDepthConfig with default values.") if thermal_config is None: thermal_config = {} - logger.info( - "thermal_config is None. initializing the ImageBindThermalConfig with default values." - ) + logger.info("thermal_config is None. initializing the ImageBindThermalConfig with default values.") if imu_config is None: imu_config = {} - logger.info( - "imu_config is None. initializing the ImageBindIMUConfig with default values." - ) + logger.info("imu_config is None. 
initializing the ImageBindIMUConfig with default values.") # text_config["projection_dim"] = projection_dim # vision_config["projection_dim"] = projection_dim @@ -334,10 +307,11 @@ def __init__( @classmethod def from_text_vision_configs( - cls, - text_config: ImageBindTextConfig, - vision_config: ImageBindVisionConfig, - **kwargs, ): + cls, + text_config: ImageBindTextConfig, + vision_config: ImageBindVisionConfig, + **kwargs, + ): r""" Instantiate a [`ImageBindConfig`] (or a derived class) from clip text model configuration and clip vision model configuration. @@ -349,7 +323,8 @@ def from_text_vision_configs( return cls( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/imagebind/helpers.py b/paddlemix/models/imagebind/helpers.py index 228a17265b9f6..f6f3a4efef632 100644 --- a/paddlemix/models/imagebind/helpers.py +++ b/paddlemix/models/imagebind/helpers.py @@ -24,7 +24,8 @@ AUDIO="audio", THERMAL="thermal", DEPTH="depth", - IMU="imu", ) + IMU="imu", +) class Normalize(paddle.nn.Layer): @@ -38,10 +39,11 @@ def forward(self, x): class LearnableLogitScaling(paddle.nn.Layer): def __init__( - self, - logit_scale_init: float=1 / 0.07, - learnable: bool=True, - max_logit_scale: float=100, ) -> None: + self, + logit_scale_init: float = 1 / 0.07, + learnable: bool = True, + max_logit_scale: float = 100, + ) -> None: super().__init__() self.max_logit_scale = max_logit_scale self.logit_scale_init = logit_scale_init @@ -49,18 +51,15 @@ def __init__( log_logit_scale = paddle.ones(shape=[]) * np.log(self.logit_scale_init) if learnable: self.log_logit_scale = paddle.create_parameter( - shape=log_logit_scale.shape - if log_logit_scale.dim() != 0 else [1], + shape=log_logit_scale.shape if log_logit_scale.dim() != 0 else [1], dtype=log_logit_scale.dtype, - default_initializer=paddle.nn.initializer.Assign( - value=log_logit_scale), ) + default_initializer=paddle.nn.initializer.Assign(value=log_logit_scale), + ) else: self.register_buffer("log_logit_scale", log_logit_scale) def forward(self, x): - return (paddle.clip( - x=self.log_logit_scale.exp(), - max=self.max_logit_scale).unsqueeze(0) * x) + return paddle.clip(x=self.log_logit_scale.exp(), max=self.max_logit_scale).unsqueeze(0) * x class EinOpsRearrange(paddle.nn.Layer): @@ -81,14 +80,20 @@ class VerboseNNModule(paddle.nn.Layer): @staticmethod def get_readable_tensor_repr(name: str, tensor: paddle.Tensor) -> str: - st = ("(" + name + "): " + "tensor(" + str(tuple(tensor[1].shape)) + - ", requires_grad=" + str(not tensor[1].stop_gradient) + ")\n") + st = ( + "(" + + name + + "): " + + "tensor(" + + str(tuple(tensor[1].shape)) + + ", requires_grad=" + + str(not tensor[1].stop_gradient) + + ")\n" + ) return st -def cast_if_src_dtype(tensor: paddle.Tensor, - src_dtype: paddle.dtype, - tgt_dtype: paddle.dtype): +def cast_if_src_dtype(tensor: paddle.Tensor, src_dtype: paddle.dtype, tgt_dtype: paddle.dtype): updated = False if tensor.dtype == src_dtype: tensor = tensor.cast(tgt_dtype) diff --git a/paddlemix/models/imagebind/modeling.py b/paddlemix/models/imagebind/modeling.py index e3f85a9a2016a..6b7104c8f9ae5 100644 --- a/paddlemix/models/imagebind/modeling.py +++ b/paddlemix/models/imagebind/modeling.py @@ -14,23 +14,27 @@ from functools import partial from types import SimpleNamespace -from typing import Any, Dict, List, Tuple -import numpy as np import paddle -from paddle import nn -from paddle.nn import functional as F -from 
paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from .configuration import ImageBindConfig -from .helpers import (EinOpsRearrange, LearnableLogitScaling, Normalize, - SelectElement, SelectEOSAndProject, VerboseNNModule, - cast_if_src_dtype) +from .helpers import ( + LearnableLogitScaling, + Normalize, + SelectElement, + SelectEOSAndProject, +) from .multimodal_preprocessors import ( - AudioPreprocessor, IMUPreprocessor, PadIm2Video, PatchEmbedGeneric, - RGBDTPreprocessor, SpatioTemporalPosEmbeddingHelper, TextPreprocessor, - ThermalPreprocessor) + AudioPreprocessor, + IMUPreprocessor, + PadIm2Video, + PatchEmbedGeneric, + RGBDTPreprocessor, + SpatioTemporalPosEmbeddingHelper, + TextPreprocessor, + ThermalPreprocessor, +) from .transformer import MultiheadAttention, SimpleTransformer ModalityType = SimpleNamespace( @@ -39,7 +43,8 @@ AUDIO="audio", THERMAL="thermal", DEPTH="depth", - IMU="imu", ) + IMU="imu", +) __all__ = [ "ImageBindModel", @@ -85,8 +90,8 @@ def __init__(self, config: ImageBindConfig): text_embed_dim = config.text_config.text_embed_dim text_num_blocks = config.text_config.text_num_blocks text_num_heads = config.text_config.text_num_heads - context_length = config.text_config.context_length - vocab_size = config.text_config.vocab_size + # context_length = config.text_config.context_length + # vocab_size = config.text_config.vocab_size # depth_config depth_embed_dim = config.depth_config.depth_embed_dim @@ -104,7 +109,7 @@ def __init__(self, config: ImageBindConfig): # imu_config imu_embed_dim = config.imu_config.imu_embed_dim - imu_kernel_size = config.imu_config.imu_kernel_size + # imu_kernel_size = config.imu_config.imu_kernel_size imu_num_blocks = config.imu_config.imu_num_blocks imu_num_heads = config.imu_config.imu_num_heads imu_drop_path = config.imu_config.imu_drop_path @@ -123,7 +128,8 @@ def __init__(self, config: ImageBindConfig): depth_kernel_size, thermal_embed_dim, thermal_kernel_size, - imu_embed_dim, ) + imu_embed_dim, + ) self.modality_trunks = self._create_modality_trunks( vision_embed_dim, vision_num_blocks, @@ -146,7 +152,8 @@ def __init__(self, config: ImageBindConfig): imu_embed_dim, imu_num_blocks, imu_num_heads, - imu_drop_path, ) + imu_drop_path, + ) self.modality_heads = self._create_modality_heads( out_embed_dim, vision_embed_dim, @@ -154,48 +161,52 @@ def __init__(self, config: ImageBindConfig): audio_embed_dim, depth_embed_dim, thermal_embed_dim, - imu_embed_dim, ) - self.modality_postprocessors = self._create_modality_postprocessors( - out_embed_dim) + imu_embed_dim, + ) + self.modality_postprocessors = self._create_modality_postprocessors(out_embed_dim) def _create_modality_preprocessors( - self, - video_frames, - vision_embed_dim, - kernel_size, - text_embed_dim, - audio_embed_dim, - audio_kernel_size, - audio_stride, - audio_num_mel_bins, - audio_target_len, - depth_embed_dim, - depth_kernel_size, - thermal_embed_dim, - thermal_kernel_size, - imu_embed_dim, ): - rgbt_stem = PatchEmbedGeneric(proj_stem=[ - PadIm2Video( - pad_type="repeat", ntimes=2), - paddle.nn.Conv3D( - in_channels=3, - kernel_size=kernel_size, - out_channels=vision_embed_dim, - stride=kernel_size, - bias_attr=False, ), - ]) + self, + video_frames, + vision_embed_dim, + kernel_size, + text_embed_dim, + audio_embed_dim, + audio_kernel_size, + audio_stride, + audio_num_mel_bins, + audio_target_len, + depth_embed_dim, + depth_kernel_size, + thermal_embed_dim, + 
thermal_kernel_size, + imu_embed_dim, + ): + rgbt_stem = PatchEmbedGeneric( + proj_stem=[ + PadIm2Video(pad_type="repeat", ntimes=2), + paddle.nn.Conv3D( + in_channels=3, + kernel_size=kernel_size, + out_channels=vision_embed_dim, + stride=kernel_size, + bias_attr=False, + ), + ] + ) rgbt_preprocessor = RGBDTPreprocessor( img_size=[3, video_frames, 224, 224], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), rgbt_stem=rgbt_stem, - depth_stem=None, ) + depth_stem=None, + ) text_preprocessor = TextPreprocessor( context_length=77, vocab_size=49408, embed_dim=text_embed_dim, - causal_masking=True, ) + causal_masking=True, + ) audio_stem = PatchEmbedGeneric( proj_stem=[ paddle.nn.Conv2D( @@ -203,19 +214,22 @@ def _create_modality_preprocessors( kernel_size=audio_kernel_size, stride=audio_stride, out_channels=audio_embed_dim, - bias_attr=False, ) + bias_attr=False, + ) ], norm_layer=paddle.nn.LayerNorm( normalized_shape=audio_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) audio_preprocessor = AudioPreprocessor( img_size=[1, audio_num_mel_bins, audio_target_len], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), - audio_stem=audio_stem, ) + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), + audio_stem=audio_stem, + ) depth_stem = PatchEmbedGeneric( [ paddle.nn.Conv2D( @@ -223,20 +237,23 @@ def _create_modality_preprocessors( in_channels=1, out_channels=depth_embed_dim, stride=depth_kernel_size, - bias_attr=False, ) + bias_attr=False, + ) ], norm_layer=paddle.nn.LayerNorm( normalized_shape=depth_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) depth_preprocessor = RGBDTPreprocessor( img_size=[1, 224, 224], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), rgbt_stem=None, - depth_stem=depth_stem, ) + depth_stem=depth_stem, + ) thermal_stem = PatchEmbedGeneric( [ paddle.nn.Conv2D( @@ -244,37 +261,39 @@ def _create_modality_preprocessors( in_channels=1, out_channels=thermal_embed_dim, stride=thermal_kernel_size, - bias_attr=False, ) + bias_attr=False, + ) ], norm_layer=paddle.nn.LayerNorm( normalized_shape=thermal_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) thermal_preprocessor = ThermalPreprocessor( img_size=[1, 224, 224], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), - thermal_stem=thermal_stem, ) + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), + thermal_stem=thermal_stem, + ) imu_stem = PatchEmbedGeneric( - [ - paddle.nn.Linear( - in_features=48, out_features=imu_embed_dim, bias_attr=False) - ], + [paddle.nn.Linear(in_features=48, out_features=imu_embed_dim, bias_attr=False)], norm_layer=paddle.nn.LayerNorm( normalized_shape=imu_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) imu_preprocessor = IMUPreprocessor( img_size=[6, 2000], num_cls_tokens=1, kernel_size=8, embed_dim=imu_embed_dim, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), - imu_stem=imu_stem, ) + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), + imu_stem=imu_stem, + ) modality_preprocessors = { ModalityType.VISION: 
rgbt_preprocessor, ModalityType.TEXT: text_preprocessor, @@ -286,31 +305,31 @@ def _create_modality_preprocessors( return paddle.nn.LayerDict(sublayers=modality_preprocessors) def _create_modality_trunks( - self, - vision_embed_dim, - vision_num_blocks, - vision_num_heads, - text_embed_dim, - text_num_blocks, - text_num_heads, - audio_embed_dim, - audio_num_blocks, - audio_num_heads, - audio_drop_path, - depth_embed_dim, - depth_num_blocks, - depth_num_heads, - depth_drop_path, - thermal_embed_dim, - thermal_num_blocks, - thermal_num_heads, - thermal_drop_path, - imu_embed_dim, - imu_num_blocks, - imu_num_heads, - imu_drop_path, ): - def instantiate_trunk(embed_dim, num_blocks, num_heads, - pre_transformer_ln, add_bias_kv, drop_path): + self, + vision_embed_dim, + vision_num_blocks, + vision_num_heads, + text_embed_dim, + text_num_blocks, + text_num_heads, + audio_embed_dim, + audio_num_blocks, + audio_num_heads, + audio_drop_path, + depth_embed_dim, + depth_num_blocks, + depth_num_heads, + depth_drop_path, + thermal_embed_dim, + thermal_num_blocks, + thermal_num_heads, + thermal_drop_path, + imu_embed_dim, + imu_num_blocks, + imu_num_heads, + imu_drop_path, + ): + def instantiate_trunk(embed_dim, num_blocks, num_heads, pre_transformer_ln, add_bias_kv, drop_path): return SimpleTransformer( embed_dim=embed_dim, num_blocks=num_blocks, @@ -321,14 +340,17 @@ def instantiate_trunk(embed_dim, num_blocks, num_heads, add_bias_kv=add_bias_kv, embed_dim=embed_dim, num_heads=num_heads, - bias_attr=True, ), + bias_attr=True, + ), pre_transformer_layer=paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ) - if pre_transformer_ln else paddle.nn.Identity(), + bias_attr=None, + ) + if pre_transformer_ln + else paddle.nn.Identity(), # EinOpsRearrange('b l d -> l b d') ), # post_transformer_layer=EinOpsRearrange('l b d -> b l d') @@ -341,144 +363,159 @@ def instantiate_trunk(embed_dim, num_blocks, num_heads, vision_num_heads, pre_transformer_ln=True, add_bias_kv=False, - drop_path=0.0, ) + drop_path=0.0, + ) modality_trunks[ModalityType.TEXT] = instantiate_trunk( text_embed_dim, text_num_blocks, text_num_heads, pre_transformer_ln=False, add_bias_kv=False, - drop_path=0.0, ) + drop_path=0.0, + ) modality_trunks[ModalityType.AUDIO] = instantiate_trunk( audio_embed_dim, audio_num_blocks, audio_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=audio_drop_path, ) + drop_path=audio_drop_path, + ) modality_trunks[ModalityType.DEPTH] = instantiate_trunk( depth_embed_dim, depth_num_blocks, depth_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=depth_drop_path, ) + drop_path=depth_drop_path, + ) modality_trunks[ModalityType.THERMAL] = instantiate_trunk( thermal_embed_dim, thermal_num_blocks, thermal_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=thermal_drop_path, ) + drop_path=thermal_drop_path, + ) modality_trunks[ModalityType.IMU] = instantiate_trunk( imu_embed_dim, imu_num_blocks, imu_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=imu_drop_path, ) + drop_path=imu_drop_path, + ) return paddle.nn.LayerDict(sublayers=modality_trunks) def _create_modality_heads( - self, - out_embed_dim, - vision_embed_dim, - text_embed_dim, - audio_embed_dim, - depth_embed_dim, - thermal_embed_dim, - imu_embed_dim, ): + self, + out_embed_dim, + vision_embed_dim, + text_embed_dim, + audio_embed_dim, + depth_embed_dim, + thermal_embed_dim, + imu_embed_dim, + ): modality_heads = {} 
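[Reviewer note, not part of the diff] Each of the per-modality head assignments that follow builds the same three-stage pipeline: LayerNorm over the trunk output, selection of the CLS token, and a bias-free linear projection into the shared out_embed_dim space (the text head selects the EOS token instead, and the IMU head adds dropout, but the shape is the same). The hypothetical helper below only makes that shared pattern explicit; SelectCLS stands in for the repository's SelectElement(index=0) and is not part of the PR.

    import paddle

    class SelectCLS(paddle.nn.Layer):
        def forward(self, x):
            # x: [batch, seq_len, embed_dim] -> CLS token at position 0
            return x[:, 0, :]

    def make_head(embed_dim: int, out_embed_dim: int) -> paddle.nn.Sequential:
        return paddle.nn.Sequential(
            paddle.nn.LayerNorm(normalized_shape=embed_dim, epsilon=1e-6),
            SelectCLS(),
            paddle.nn.Linear(embed_dim, out_embed_dim, bias_attr=False),
        )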
modality_heads[ModalityType.VISION] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=vision_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), paddle.nn.Linear( in_features=vision_embed_dim, out_features=out_embed_dim, - bias_attr=False, ), ) + bias_attr=False, + ), + ) modality_heads[ModalityType.TEXT] = SelectEOSAndProject( proj=paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=text_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), paddle.nn.Linear( in_features=text_embed_dim, out_features=out_embed_dim, - bias_attr=False, ), )) + bias_attr=False, + ), + ) + ) modality_heads[ModalityType.AUDIO] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=audio_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), - paddle.nn.Linear( - in_features=audio_embed_dim, - out_features=out_embed_dim, - bias_attr=False), ) + paddle.nn.Linear(in_features=audio_embed_dim, out_features=out_embed_dim, bias_attr=False), + ) modality_heads[ModalityType.DEPTH] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=depth_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), - paddle.nn.Linear( - in_features=depth_embed_dim, - out_features=out_embed_dim, - bias_attr=False), ) + paddle.nn.Linear(in_features=depth_embed_dim, out_features=out_embed_dim, bias_attr=False), + ) modality_heads[ModalityType.THERMAL] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=thermal_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), paddle.nn.Linear( in_features=thermal_embed_dim, out_features=out_embed_dim, - bias_attr=False, ), ) + bias_attr=False, + ), + ) modality_heads[ModalityType.IMU] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=imu_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), paddle.nn.Dropout(p=0.5), - paddle.nn.Linear( - in_features=imu_embed_dim, - out_features=out_embed_dim, - bias_attr=False), ) + paddle.nn.Linear(in_features=imu_embed_dim, out_features=out_embed_dim, bias_attr=False), + ) return paddle.nn.LayerDict(sublayers=modality_heads) def _create_modality_postprocessors(self, out_embed_dim): modality_postprocessors = {} modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1) modality_postprocessors[ModalityType.TEXT] = paddle.nn.Sequential( - Normalize(dim=-1), LearnableLogitScaling(learnable=True)) + Normalize(dim=-1), LearnableLogitScaling(learnable=True) + ) modality_postprocessors[ModalityType.AUDIO] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=20.0, learnable=False), ) + LearnableLogitScaling(logit_scale_init=20.0, learnable=False), + ) modality_postprocessors[ModalityType.DEPTH] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=5.0, learnable=False), ) + LearnableLogitScaling(logit_scale_init=5.0, learnable=False), + ) modality_postprocessors[ModalityType.THERMAL] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=10.0, learnable=False), ) + LearnableLogitScaling(logit_scale_init=10.0, learnable=False), + ) modality_postprocessors[ModalityType.IMU] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=5.0, learnable=False), ) + 
LearnableLogitScaling(logit_scale_init=5.0, learnable=False), + ) return paddle.nn.LayerDict(sublayers=modality_postprocessors) def forward(self, inputs): @@ -487,24 +524,18 @@ def forward(self, inputs): reduce_list = modality_value.ndim >= 5 if reduce_list: B, S = modality_value.shape[:2] - modality_value = modality_value.reshape( - B * S, *modality_value.shape[2:]) + modality_value = modality_value.reshape(B * S, *modality_value.shape[2:]) if modality_value is not None: - modality_value = self.modality_preprocessors[modality_key](**{ - modality_key: modality_value - }) + modality_value = self.modality_preprocessors[modality_key](**{modality_key: modality_value}) print( f"modal: {modality_key} paddle_modality_value['trunk']['tokens'].mean(): {modality_value['trunk']['tokens'].mean().item()}" ) trunk_inputs = modality_value["trunk"] head_inputs = modality_value["head"] - modality_value = self.modality_trunks[modality_key]( - **trunk_inputs) - modality_value = self.modality_heads[modality_key]( - modality_value, **head_inputs) - modality_value = self.modality_postprocessors[modality_key]( - modality_value) + modality_value = self.modality_trunks[modality_key](**trunk_inputs) + modality_value = self.modality_heads[modality_key](modality_value, **head_inputs) + modality_value = self.modality_postprocessors[modality_key](modality_value) if reduce_list: modality_value = modality_value.reshape(B, S, -1) modality_value = modality_value.mean(axis=1) diff --git a/paddlemix/models/imagebind/multimodal_modules.py b/paddlemix/models/imagebind/multimodal_modules.py index 507b22caefa68..20d6f198208ca 100644 --- a/paddlemix/models/imagebind/multimodal_modules.py +++ b/paddlemix/models/imagebind/multimodal_modules.py @@ -12,11 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gzip import html -import io import math -import sys from functools import lru_cache from typing import Callable, List, Optional, Tuple @@ -24,9 +21,6 @@ import numpy as np import paddle import regex as re -from iopath.common.file_io import g_pathmgr - -import paddlemix.utils.paddle_aux from .helpers import VerboseNNModule, cast_if_src_dtype @@ -35,15 +29,12 @@ def get_sinusoid_encoding_table(n_position, d_hid): """Sinusoid position encoding table""" def get_position_angle_vec(position): - return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) - for hid_j in range(d_hid)] + return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) for hid_j in range(d_hid)] - sinusoid_table = np.array( - [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) - return paddle.to_tensor( - data=sinusoid_table, dtype="float32").unsqueeze(axis=0) + return paddle.to_tensor(data=sinusoid_table, dtype="float32").unsqueeze(axis=0) def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): @@ -53,10 +44,10 @@ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): dim = pos_embed.shape[-1] pos_embed, updated = cast_if_src_dtype(pos_embed, "bfloat16", "float32") pos_embed = paddle.nn.functional.interpolate( - x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), - dim).transpose(perm=[0, 3, 1, 2]), + x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).transpose(perm=[0, 3, 1, 2]), scale_factor=math.sqrt(target_spatial_size / N), - mode="bicubic", ) + mode="bicubic", + ) if updated: pos_embed, _ = cast_if_src_dtype(pos_embed, "float32", "bfloat16") @@ -65,17 +56,12 @@ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): return pos_embed -def interpolate_pos_encoding(npatch_per_img, - pos_embed, - patches_layout, - input_shape=None, - first_patch_idx=1): +def interpolate_pos_encoding(npatch_per_img, pos_embed, patches_layout, input_shape=None, first_patch_idx=1): assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none" N = pos_embed.shape[1] - first_patch_idx if npatch_per_img == N: return pos_embed - assert (patches_layout[-1] == patches_layout[-2] - ), "Interpolation of pos embed not supported for non-square layouts" + assert patches_layout[-1] == patches_layout[-2], "Interpolation of pos embed not supported for non-square layouts" class_emb = pos_embed[:, :first_patch_idx] pos_embed = pos_embed[:, first_patch_idx:] if input_shape is None or patches_layout[0] == 1: @@ -87,24 +73,20 @@ def interpolate_pos_encoding(npatch_per_img, # pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1) pos_embed = pos_embed.reshape((1, num_frames, num_spatial_tokens, -1)) - pos_embed = interpolate_pos_encoding_2d( - npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) + pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) else: raise ValueError("This type of interpolation isn't implemented") return paddle.concat(x=(class_emb, pos_embed), axis=1) -def _get_pos_embedding(npatch_per_img, - pos_embed, - patches_layout, - input_shape, - first_patch_idx=1): +def _get_pos_embedding(npatch_per_img, pos_embed, patches_layout, input_shape, first_patch_idx=1): pos_embed = interpolate_pos_encoding( npatch_per_img, pos_embed, patches_layout, input_shape=input_shape, - 
first_patch_idx=first_patch_idx, ) + first_patch_idx=first_patch_idx, + ) return pos_embed @@ -113,7 +95,7 @@ class PatchEmbedGeneric(paddle.nn.Layer): PatchEmbed from Hydra """ - def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer]=None): + def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer] = None): super().__init__() if len(proj_stem) > 1: self.proj = paddle.nn.Sequential(*proj_stem) @@ -144,12 +126,13 @@ def forward(self, x): class SpatioTemporalPosEmbeddingHelper(VerboseNNModule): def __init__( - self, - patches_layout: List, - num_patches: int, - num_cls_tokens: int, - embed_dim: int, - learnable: bool, ) -> None: + self, + patches_layout: List, + num_patches: int, + num_cls_tokens: int, + embed_dim: int, + learnable: bool, + ) -> None: super().__init__() self.num_cls_tokens = num_cls_tokens self.patches_layout = patches_layout @@ -161,14 +144,13 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.num_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) paddle.nn.initializer.TruncatedNormal(std=0.02)(self.pos_embed) # import timm # timm.models.layers.trunc_normal_(self.pos_embed, std=0.02) else: - self.register_buffer( - "pos_embed", - get_sinusoid_encoding_table(self.num_tokens, embed_dim)) + self.register_buffer("pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)) def get_pos_embedding(self, vision_input, all_vision_tokens): input_shape = vision_input.shape @@ -177,24 +159,25 @@ def get_pos_embedding(self, vision_input, all_vision_tokens): pos_embed=self.pos_embed, patches_layout=self.patches_layout, input_shape=input_shape, - first_patch_idx=self.num_cls_tokens, ) + first_patch_idx=self.num_cls_tokens, + ) return pos_embed class RGBDTPreprocessor(VerboseNNModule): def __init__( - self, - rgbt_stem: PatchEmbedGeneric, - depth_stem: Optional[PatchEmbedGeneric], - img_size: Tuple=(3, 224, 224), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - use_type_embed: bool=False, - init_param_style: str="openclip", ) -> None: + self, + rgbt_stem: PatchEmbedGeneric, + depth_stem: Optional[PatchEmbedGeneric], + img_size: Tuple = (3, 224, 224), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + use_type_embed: bool = False, + init_param_style: str = "openclip", + ) -> None: super().__init__() stem = rgbt_stem if rgbt_stem is not None else depth_stem - self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout( - img_size) + self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout(img_size) self.rgbt_stem = rgbt_stem self.depth_stem = depth_stem self.use_pos_embed = pos_embed_fn is not None @@ -205,19 +188,22 @@ def __init__( patches_layout=self.patches_layout, num_cls_tokens=num_cls_tokens, num_patches=self.num_patches, - embed_dim=self.embed_dim, ) + embed_dim=self.embed_dim, + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) if self.use_type_embed: self.type_embed = paddle.create_parameter( shape=[1, 1, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) 
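[Reviewer note, not part of the diff] The preprocessor __init__ above is only reformatted; functionally, every preprocessor tokenizes its input with a patch stem, prepends num_cls_tokens learnable CLS tokens, and adds a (possibly interpolated) positional embedding before handing the tokens to the trunk. A rough stand-alone sketch of that step, with assumed tensor shapes, is:

    import paddle

    def add_cls_and_pos(tokens: paddle.Tensor, cls_token: paddle.Tensor, pos_embed: paddle.Tensor) -> paddle.Tensor:
        # tokens:    [B, N, D]      patch tokens produced by the stem
        # cls_token: [1, C, D]      learnable CLS token(s)
        # pos_embed: [1, C + N, D]  positional embedding covering CLS + patch tokens
        B = tokens.shape[0]
        cls = cls_token.expand([B, -1, -1])            # broadcast over the batch
        tokens = paddle.concat([cls, tokens], axis=1)  # [B, C + N, D]
        return tokens + pos_embed

This mirrors tokenize_input_and_cls_pos further down in the file; the interpolation of pos_embed for non-default resolutions is handled separately by the pos_embedding_helper.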
@paddle.no_grad() @@ -225,11 +211,9 @@ def init_parameters(self, init_param_style): if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.use_pos_embed: - paddle.nn.initializer.Normal()( - self.pos_embedding_helper.pos_embed) + paddle.nn.initializer.Normal()(self.pos_embedding_helper.pos_embed) - self.pos_embedding_helper.pos_embed.set_value( - self.pos_embedding_helper.pos_embed * scale) + self.pos_embedding_helper.pos_embed.set_value(self.pos_embedding_helper.pos_embed * scale) if self.num_cls_tokens > 0: paddle.nn.initializer.Normal()(self.cls_token) @@ -250,8 +234,7 @@ def tokenize_input_and_cls_pos(self, input, stem, mask): class_tokens = self.cls_token.expand(shape=[B, -1, -1]) tokens = paddle.concat(x=(class_tokens, tokens), axis=1) if self.use_pos_embed: - pos_embed = self.pos_embedding_helper.get_pos_embedding(input, - tokens) + pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens) tokens = tokens + pos_embed if self.use_type_embed: tokens = tokens + self.type_embed.expand(shape=[B, -1, -1]) @@ -261,11 +244,9 @@ def forward(self, vision=None, depth=None, patch_mask=None): if patch_mask is not None: raise NotImplementedError() if vision is not None: - vision_tokens = self.tokenize_input_and_cls_pos( - vision, self.rgbt_stem, patch_mask) + vision_tokens = self.tokenize_input_and_cls_pos(vision, self.rgbt_stem, patch_mask) if depth is not None: - depth_tokens = self.tokenize_input_and_cls_pos( - depth, self.depth_stem, patch_mask) + depth_tokens = self.tokenize_input_and_cls_pos(depth, self.depth_stem, patch_mask) if vision is not None and depth is not None: final_tokens = vision_tokens + depth_tokens else: @@ -303,14 +284,15 @@ def build_causal_attention_mask(context_length): class TextPreprocessor(VerboseNNModule): def __init__( - self, - vocab_size: int, - context_length: int, - embed_dim: int, - causal_masking: bool, - supply_seq_len_to_head: bool=True, - num_cls_tokens: int=0, - init_param_style: str="openclip", ) -> None: + self, + vocab_size: int, + context_length: int, + embed_dim: int, + causal_masking: bool, + supply_seq_len_to_head: bool = True, + num_cls_tokens: int = 0, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.vocab_size = vocab_size self.context_length = context_length @@ -319,8 +301,10 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.context_length + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[1, self.context_length + num_cls_tokens, embed_dim])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, self.context_length + num_cls_tokens, embed_dim]) + ), + ) self.causal_masking = causal_masking if self.causal_masking: mask = build_causal_attention_mask(self.context_length) @@ -334,7 +318,8 @@ def __init__( self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() @@ -402,8 +387,7 @@ def forward(self, x): x = x.tile(repeat_times=new_shape) elif self.pad_type == "zero": padarg = [0, 0] * len(x.shape) - padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[ - self.time_dim] + padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[self.time_dim] x = paddle.nn.functional.pad(x=x, pad=padarg) return x @@ -419,9 +403,9 @@ def 
bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ - bs = (list(range(ord("!"), ord("~") + 1)) + - list(range(ord("¡"), ord("¬") + 1)) + - list(range(ord("®"), ord("ÿ") + 1))) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 for b in range(2**8): @@ -457,122 +441,17 @@ def whitespace_clean(text): return text -class SimpleTokenizer(object): - def __init__(self, bpe_path: str, context_length=77): - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with g_pathmgr.open(bpe_path, "rb") as fh: - bpe_bytes = io.BytesIO(fh.read()) - merges: List[str] = gzip.open(bpe_bytes).read().decode( - "utf-8").split("\n") - merges = merges[1:49152 - 256 - 2 + 1] - - merges: List[Tuple[str, .. - .]] = [tuple(merge.split()) for merge in merges] - vocab = list(bytes_to_unicode().values()) - vocab = vocab + [(v + "") for v in vocab] - for merge in merges: - vocab.append("".join(merge)) - - vocab.extend(["<|startoftext|>", "<|endoftext|>"]) - self.encoder = dict(zip(vocab, range(len(vocab)))) - self.decoder = {v: k for k, v in self.encoder.items()} - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = { - "<|startoftext|>": "<|startoftext|>", - "<|endoftext|>": "<|endoftext|>", - } - self.pat = re.compile( - "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", - re.IGNORECASE, ) - self.context_length = context_length - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - - word = tuple(token[:-1]) + (token[-1] + "", ) - pairs = get_pairs(word) - if not pairs: - return token + "" - while True: - bigram = min( - pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - if word[i] == first and i < len(word) - 1 and word[i + - 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - text = whitespace_clean(basic_clean(text)).lower() - for token in re.findall(self.pat, text): - token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) - - bpe_tokens.extend(self.encoder[bpe_token] - for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def decode(self, tokens): - text = "".join([self.decoder[token] for token in tokens]) - text = (bytearray([self.byte_decoder[c] for c in text]).decode( - "utf-8", errors="replace").replace("", " ")) - return text - - def __call__(self, texts, context_length=None): - if not context_length: - context_length = self.context_length - if isinstance(texts, str): - texts = [texts] - sot_token = self.encoder["<|startoftext|>"] - eot_token = self.encoder["<|endoftext|>"] - all_tokens = [([sot_token] + self.encode(text) + [eot_token]) - for text in texts] - result = paddle.zeros( - shape=[len(all_tokens), context_length], dtype="int64") - for i, tokens in enumerate(all_tokens): - tokens = 
tokens[:context_length] - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) - if len(result) == 1: - return result[0] - return result - - class IMUPreprocessor(VerboseNNModule): def __init__( - self, - kernel_size: int, - imu_stem: PatchEmbedGeneric, - embed_dim: int, - img_size: Tuple=(6, 2000), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - init_param_style: str="openclip", ) -> None: + self, + kernel_size: int, + imu_stem: PatchEmbedGeneric, + embed_dim: int, + img_size: Tuple = (6, 2000), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.imu_stem = imu_stem self.embed_dim = embed_dim @@ -583,16 +462,17 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[ - 1, img_size[1] // kernel_size + num_cls_tokens, embed_dim - ])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim]) + ), + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() @@ -624,8 +504,7 @@ def tokenize_input_and_cls_pos(self, input, stem): def forward(self, imu): - imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose( - perm=[0, 2, 1, 3]) # 需要对齐 + imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose(perm=[0, 2, 1, 3]) # 需要对齐 imu = imu.reshape((imu.shape[0], imu.shape[1], -1)) imu_tokens = self.tokenize_input_and_cls_pos(imu, self.imu_stem) return_dict = {"trunk": {"tokens": imu_tokens}, "head": {}} diff --git a/paddlemix/models/imagebind/multimodal_preprocessors.py b/paddlemix/models/imagebind/multimodal_preprocessors.py index 79910119e60ec..397adbd1d19cf 100644 --- a/paddlemix/models/imagebind/multimodal_preprocessors.py +++ b/paddlemix/models/imagebind/multimodal_preprocessors.py @@ -1,20 +1,18 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import gzip import html -import io import math from functools import lru_cache from typing import Callable, List, Optional, Tuple @@ -23,7 +21,6 @@ import numpy as np import paddle import regex as re -from iopath.common.file_io import g_pathmgr from .helpers import VerboseNNModule, cast_if_src_dtype @@ -32,15 +29,12 @@ def get_sinusoid_encoding_table(n_position, d_hid): """Sinusoid position encoding table""" def get_position_angle_vec(position): - return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) - for hid_j in range(d_hid)] + return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) for hid_j in range(d_hid)] - sinusoid_table = np.array( - [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) - return paddle.to_tensor( - data=sinusoid_table, dtype='float32').unsqueeze(axis=0) + return paddle.to_tensor(data=sinusoid_table, dtype="float32").unsqueeze(axis=0) def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): @@ -48,60 +42,51 @@ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): if N == target_spatial_size: return pos_embed dim = pos_embed.shape[-1] - pos_embed, updated = cast_if_src_dtype(pos_embed, 'bfloat16', 'float32') + pos_embed, updated = cast_if_src_dtype(pos_embed, "bfloat16", "float32") pos_embed = paddle.nn.functional.interpolate( - x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), - dim).transpose(perm=[0, 3, 1, 2]), + x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).transpose(perm=[0, 3, 1, 2]), scale_factor=math.sqrt(target_spatial_size / N), - mode='bicubic', ) + mode="bicubic", + ) if updated: - pos_embed, _ = cast_if_src_dtype(pos_embed, 'float32', 'bfloat16') + pos_embed, _ = cast_if_src_dtype(pos_embed, "float32", "bfloat16") # pos_embed = pos_embed.transpose(perm=[0, 2, 3, 1]).view(1, -1, dim) pos_embed = pos_embed.transpose(perm=[0, 2, 3, 1]).reshape((1, -1, dim)) return pos_embed -def interpolate_pos_encoding(npatch_per_img, - pos_embed, - patches_layout, - input_shape=None, - first_patch_idx=1): - assert first_patch_idx == 0 or first_patch_idx == 1, 'there is 1 CLS token or none' +def interpolate_pos_encoding(npatch_per_img, pos_embed, patches_layout, input_shape=None, first_patch_idx=1): + assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none" N = pos_embed.shape[1] - first_patch_idx if npatch_per_img == N: return pos_embed - assert (patches_layout[-1] == patches_layout[-2] - ), 'Interpolation of pos embed not supported for non-square layouts' + assert patches_layout[-1] == patches_layout[-2], "Interpolation of pos embed not supported for non-square layouts" class_emb = pos_embed[:, :first_patch_idx] pos_embed = pos_embed[:, first_patch_idx:] if input_shape is None or patches_layout[0] == 1: pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed) elif patches_layout[0] > 1: - assert len(input_shape) == 4, 'temporal interpolation not supported' + assert len(input_shape) == 4, "temporal interpolation not supported" num_frames = patches_layout[0] num_spatial_tokens = patches_layout[1] * patches_layout[2] # pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1) pos_embed = pos_embed.reshape((1, num_frames, num_spatial_tokens, -1)) - pos_embed = interpolate_pos_encoding_2d( - npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) + 
pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) else: raise ValueError("This type of interpolation isn't implemented") return paddle.concat(x=(class_emb, pos_embed), axis=1) -def _get_pos_embedding(npatch_per_img, - pos_embed, - patches_layout, - input_shape, - first_patch_idx=1): +def _get_pos_embedding(npatch_per_img, pos_embed, patches_layout, input_shape, first_patch_idx=1): pos_embed = interpolate_pos_encoding( npatch_per_img, pos_embed, patches_layout, input_shape=input_shape, - first_patch_idx=first_patch_idx, ) + first_patch_idx=first_patch_idx, + ) return pos_embed @@ -110,7 +95,7 @@ class PatchEmbedGeneric(paddle.nn.Layer): PatchEmbed from Hydra """ - def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer]=None): + def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer] = None): super().__init__() if len(proj_stem) > 1: self.proj = paddle.nn.Sequential(*proj_stem) @@ -141,12 +126,13 @@ def forward(self, x): class SpatioTemporalPosEmbeddingHelper(VerboseNNModule): def __init__( - self, - patches_layout: List, - num_patches: int, - num_cls_tokens: int, - embed_dim: int, - learnable: bool, ) -> None: + self, + patches_layout: List, + num_patches: int, + num_cls_tokens: int, + embed_dim: int, + learnable: bool, + ) -> None: super().__init__() self.num_cls_tokens = num_cls_tokens self.patches_layout = patches_layout @@ -158,13 +144,12 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.num_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) paddle.nn.initializer.TruncatedNormal(std=0.02)(self.pos_embed) else: - self.register_buffer( - 'pos_embed', - get_sinusoid_encoding_table(self.num_tokens, embed_dim)) + self.register_buffer("pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)) def get_pos_embedding(self, vision_input, all_vision_tokens): input_shape = vision_input.shape @@ -173,24 +158,25 @@ def get_pos_embedding(self, vision_input, all_vision_tokens): pos_embed=self.pos_embed, patches_layout=self.patches_layout, input_shape=input_shape, - first_patch_idx=self.num_cls_tokens, ) + first_patch_idx=self.num_cls_tokens, + ) return pos_embed class RGBDTPreprocessor(VerboseNNModule): def __init__( - self, - rgbt_stem: PatchEmbedGeneric, - depth_stem: Optional[PatchEmbedGeneric], - img_size: Tuple=(3, 224, 224), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - use_type_embed: bool=False, - init_param_style: str='openclip', ) -> None: + self, + rgbt_stem: PatchEmbedGeneric, + depth_stem: Optional[PatchEmbedGeneric], + img_size: Tuple = (3, 224, 224), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + use_type_embed: bool = False, + init_param_style: str = "openclip", + ) -> None: super().__init__() stem = rgbt_stem if rgbt_stem is not None else depth_stem - self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout( - img_size) + self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout(img_size) self.rgbt_stem = rgbt_stem self.depth_stem = depth_stem self.use_pos_embed = pos_embed_fn is not None @@ -201,39 +187,40 @@ def __init__( patches_layout=self.patches_layout, num_cls_tokens=num_cls_tokens, num_patches=self.num_patches, - embed_dim=self.embed_dim, ) + embed_dim=self.embed_dim, + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, 
self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) if self.use_type_embed: self.type_embed = paddle.create_parameter( shape=[1, 1, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() def init_parameters(self, init_param_style): - if init_param_style == 'openclip': + if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.use_pos_embed: - paddle.nn.initializer.Normal()( - self.pos_embedding_helper.pos_embed) + paddle.nn.initializer.Normal()(self.pos_embedding_helper.pos_embed) - self.pos_embedding_helper.pos_embed.set_value( - self.pos_embedding_helper.pos_embed * scale) + self.pos_embedding_helper.pos_embed.set_value(self.pos_embedding_helper.pos_embed * scale) if self.num_cls_tokens > 0: paddle.nn.initializer.Normal()(self.cls_token) self.cls_token.set_value(self.cls_token * scale) - elif init_param_style == 'vit': + elif init_param_style == "vit": self.cls_token.data.fill_(value=0) else: - raise ValueError(f'Unknown init {init_param_style}') + raise ValueError(f"Unknown init {init_param_style}") if self.use_type_embed: paddle.nn.initializer.Normal()(self.type_embed) @@ -246,8 +233,7 @@ def tokenize_input_and_cls_pos(self, input, stem, mask): class_tokens = self.cls_token.expand(shape=[B, -1, -1]) tokens = paddle.concat(x=(class_tokens, tokens), axis=1) if self.use_pos_embed: - pos_embed = self.pos_embedding_helper.get_pos_embedding(input, - tokens) + pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens) tokens = tokens + pos_embed if self.use_type_embed: tokens = tokens + self.type_embed.expand(shape=[B, -1, -1]) @@ -257,16 +243,14 @@ def forward(self, vision=None, depth=None, patch_mask=None): if patch_mask is not None: raise NotImplementedError() if vision is not None: - vision_tokens = self.tokenize_input_and_cls_pos( - vision, self.rgbt_stem, patch_mask) + vision_tokens = self.tokenize_input_and_cls_pos(vision, self.rgbt_stem, patch_mask) if depth is not None: - depth_tokens = self.tokenize_input_and_cls_pos( - depth, self.depth_stem, patch_mask) + depth_tokens = self.tokenize_input_and_cls_pos(depth, self.depth_stem, patch_mask) if vision is not None and depth is not None: final_tokens = vision_tokens + depth_tokens else: final_tokens = vision_tokens if vision is not None else depth_tokens - return_dict = {'trunk': {'tokens': final_tokens}, 'head': {}} + return_dict = {"trunk": {"tokens": final_tokens}, "head": {}} return return_dict @@ -290,21 +274,22 @@ def build_causal_attention_mask(context_length): out_0 = paddle.empty(shape=[context_length, context_length]) out_0.stop_gradient = not False mask = out_0 - mask.fill_(value=float('-inf')) + mask.fill_(value=float("-inf")) mask = paddle.triu(mask, 1) return mask class TextPreprocessor(VerboseNNModule): def __init__( - self, - vocab_size: int, - context_length: int, - embed_dim: int, - causal_masking: bool, - supply_seq_len_to_head: bool=True, - num_cls_tokens: int=0, - init_param_style: str='openclip', ) -> None: + self, + vocab_size: int, + context_length: int, + embed_dim: int, + causal_masking: bool, + supply_seq_len_to_head: bool = True, + num_cls_tokens: int = 0, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.vocab_size = vocab_size self.context_length = 
context_length @@ -313,12 +298,14 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.context_length + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[1, self.context_length + num_cls_tokens, embed_dim])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, self.context_length + num_cls_tokens, embed_dim]) + ), + ) self.causal_masking = causal_masking if self.causal_masking: mask = build_causal_attention_mask(self.context_length) - self.register_buffer('mask', mask) + self.register_buffer("mask", mask) self.supply_seq_len_to_head = supply_seq_len_to_head self.num_cls_tokens = num_cls_tokens self.embed_dim = embed_dim @@ -328,24 +315,25 @@ def __init__( self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() - def init_parameters(self, init_param_style='openclip'): + def init_parameters(self, init_param_style="openclip"): paddle.nn.initializer.Normal(std=0.02)(self.token_embedding.weight) paddle.nn.initializer.Normal(std=0.01)(self.pos_embed) - if init_param_style == 'openclip': + if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.num_cls_tokens > 0: paddle.nn.initializer.Normal()(self.cls_token) self.cls_token.set_value(self.cls_token * scale) - elif init_param_style == 'vit': + elif init_param_style == "vit": self.cls_token.data.fill_(value=0) else: - raise ValueError(f'Unknown init {init_param_style}') + raise ValueError(f"Unknown init {init_param_style}") def forward(self, text): text_tokens = self.token_embedding(text) @@ -354,12 +342,12 @@ def forward(self, text): class_tokens = self.cls_token.expand(shape=[B, -1, -1]) text_tokens = paddle.concat(x=(class_tokens, text_tokens), axis=1) text_tokens = text_tokens + self.pos_embed - return_dict = {'trunk': {'tokens': text_tokens}, 'head': {}} + return_dict = {"trunk": {"tokens": text_tokens}, "head": {}} if self.supply_seq_len_to_head: text_lengths = text.argmax(axis=-1) - return_dict['head'] = {'seq_len': text_lengths} + return_dict["head"] = {"seq_len": text_lengths} if self.causal_masking: - return_dict['trunk'].update({'attn_mask': self.mask}) + return_dict["trunk"].update({"attn_mask": self.mask}) return return_dict @@ -376,28 +364,27 @@ def forward(self, x): elif x.ndim == 5: return x else: - raise ValueError(f'Dimension incorrect {x.shape}') + raise ValueError(f"Dimension incorrect {x.shape}") class PadIm2Video(Im2Video): def __init__(self, ntimes, pad_type, time_dim=2): super().__init__(time_dim=time_dim) assert ntimes > 0 - assert pad_type in ['zero', 'repeat'] + assert pad_type in ["zero", "repeat"] self.ntimes = ntimes self.pad_type = pad_type def forward(self, x): x = super().forward(x) if x.shape[self.time_dim] == 1: - if self.pad_type == 'repeat': + if self.pad_type == "repeat": new_shape = [1] * len(x.shape) new_shape[self.time_dim] = self.ntimes x = x.tile(repeat_times=new_shape) - elif self.pad_type == 'zero': + elif self.pad_type == "zero": padarg = [0, 0] * len(x.shape) - padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[ - self.time_dim] + padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[self.time_dim] x = paddle.nn.functional.pad(x=x, pad=padarg) return x @@ -413,9 +400,9 @@ def bytes_to_unicode(): To avoid that, we want 
lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ - bs = (list(range(ord('!'), ord('~') + 1)) + - list(range(ord('¡'), ord('¬') + 1)) + - list(range(ord('®'), ord('ÿ') + 1))) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 for b in range(2**8): @@ -446,127 +433,22 @@ def basic_clean(text): def whitespace_clean(text): - text = re.sub('\\s+', ' ', text) + text = re.sub("\\s+", " ", text) text = text.strip() return text -class SimpleTokenizer(object): - def __init__(self, bpe_path: str, context_length=77): - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with g_pathmgr.open(bpe_path, 'rb') as fh: - bpe_bytes = io.BytesIO(fh.read()) - merges: List[str] = gzip.open(bpe_bytes).read().decode( - 'utf-8').split('\n') - merges = merges[1:49152 - 256 - 2 + 1] - - merges: List[Tuple[str, .. - .]] = [tuple(merge.split()) for merge in merges] - vocab = list(bytes_to_unicode().values()) - vocab = vocab + [(v + '') for v in vocab] - for merge in merges: - vocab.append(''.join(merge)) - - vocab.extend(['<|startoftext|>', '<|endoftext|>']) - self.encoder = dict(zip(vocab, range(len(vocab)))) - self.decoder = {v: k for k, v in self.encoder.items()} - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = { - '<|startoftext|>': '<|startoftext|>', - '<|endoftext|>': '<|endoftext|>', - } - self.pat = re.compile( - "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", - re.IGNORECASE, ) - self.context_length = context_length - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - - word = tuple(token[:-1]) + (token[-1] + '', ) - pairs = get_pairs(word) - if not pairs: - return token + '' - while True: - bigram = min( - pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - if word[i] == first and i < len(word) - 1 and word[i + - 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - text = whitespace_clean(basic_clean(text)).lower() - for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - - bpe_tokens.extend(self.encoder[bpe_token] - for bpe_token in self.bpe(token).split(' ')) - return bpe_tokens - - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) - text = (bytearray([self.byte_decoder[c] for c in text]).decode( - 'utf-8', errors='replace').replace('', ' ')) - return text - - def __call__(self, texts, context_length=None): - if not context_length: - context_length = self.context_length - if isinstance(texts, str): - texts = [texts] - sot_token = self.encoder['<|startoftext|>'] - eot_token = self.encoder['<|endoftext|>'] - all_tokens = [([sot_token] + self.encode(text) + [eot_token]) - for text in texts] - result = paddle.zeros( - shape=[len(all_tokens), context_length], dtype='int64') - 
for i, tokens in enumerate(all_tokens): - tokens = tokens[:context_length] - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) - if len(result) == 1: - return result[0] - return result - - class IMUPreprocessor(VerboseNNModule): def __init__( - self, - kernel_size: int, - imu_stem: PatchEmbedGeneric, - embed_dim: int, - img_size: Tuple=(6, 2000), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - init_param_style: str='openclip', ) -> None: + self, + kernel_size: int, + imu_stem: PatchEmbedGeneric, + embed_dim: int, + img_size: Tuple = (6, 2000), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.imu_stem = imu_stem self.embed_dim = embed_dim @@ -577,32 +459,33 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[ - 1, img_size[1] // kernel_size + num_cls_tokens, embed_dim - ])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim]) + ), + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() def init_parameters(self, init_param_style): paddle.nn.initializer.TruncatedNormal(std=0.01)(self.pos_embed) - if init_param_style == 'openclip': + if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.num_cls_tokens > 0: paddle.nn.initializer.TruncatedNormal()(self.cls_token) self.cls_token.set_value(self.cls_token * scale) - elif init_param_style == 'vit': + elif init_param_style == "vit": self.cls_token.data.fill_(value=0) else: - raise ValueError(f'Unknown init {init_param_style}') + raise ValueError(f"Unknown init {init_param_style}") def tokenize_input_and_cls_pos(self, input, stem): tokens = stem.norm_layer(stem.proj(input)) @@ -618,9 +501,8 @@ def tokenize_input_and_cls_pos(self, input, stem): def forward(self, imu): - imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose( - perm=[0, 2, 1, 3]) # 需要对齐 + imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose(perm=[0, 2, 1, 3]) # 需要对齐 imu = imu.reshape((imu.shape[0], imu.shape[1], -1)) imu_tokens = self.tokenize_input_and_cls_pos(imu, self.imu_stem) - return_dict = {'trunk': {'tokens': imu_tokens}, 'head': {}} + return_dict = {"trunk": {"tokens": imu_tokens}, "head": {}} return return_dict diff --git a/paddlemix/models/imagebind/transformer.py b/paddlemix/models/imagebind/transformer.py index 59e64bc4c9f8a..bb2fdae67b0d9 100644 --- a/paddlemix/models/imagebind/transformer.py +++ b/paddlemix/models/imagebind/transformer.py @@ -21,36 +21,35 @@ class Attention(paddle.nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, ): + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 - self.qkv = paddle.nn.Linear( - in_features=dim, out_features=dim * 3, bias_attr=qkv_bias) + self.qkv = paddle.nn.Linear(in_features=dim, out_features=dim * 3, 
bias_attr=qkv_bias) self.attn_drop = paddle.nn.Dropout(p=attn_drop) self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) self.proj_drop = paddle.nn.Dropout(p=proj_drop) def forward(self, x): B, N, C = x.shape - qkv = (self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) - .transpose(perm=[2, 0, 3, 1, 4])) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(perm=[2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] x = k perm_2 = list(range(x.ndim)) perm_2[-2] = -1 perm_2[-1] = -2 - attn = q @x.transpose(perm=perm_2) * self.scale + attn = q @ x.transpose(perm=perm_2) * self.scale attn = paddle.nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) - x = attn @v + x = attn @ v perm_3 = list(range(x.ndim)) perm_3[1] = 2 perm_3[2] = 1 @@ -62,20 +61,19 @@ def forward(self, x): class Mlp(paddle.nn.Layer): def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=paddle.nn.GELU, - drop=0.0, ): + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=paddle.nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = paddle.nn.Linear( - in_features=in_features, out_features=hidden_features) + self.fc1 = paddle.nn.Linear(in_features=in_features, out_features=hidden_features) self.act = act_layer() - self.fc2 = paddle.nn.Linear( - in_features=hidden_features, out_features=out_features) + self.fc2 = paddle.nn.Linear(in_features=hidden_features, out_features=out_features) self.drop = paddle.nn.Dropout(p=drop) def forward(self, x): @@ -89,31 +87,28 @@ def forward(self, x): class MultiheadAttention(paddle.nn.MultiHeadAttention): def __init__(self, embed_dim, num_heads, *arg, add_bias_kv=None, **kwargs): - super(MultiheadAttention, self).__init__(embed_dim, num_heads, *arg, - **kwargs) + super(MultiheadAttention, self).__init__(embed_dim, num_heads, *arg, **kwargs) self.add_bias_kv = add_bias_kv self.embed_dim = embed_dim if self.add_bias_kv: self.bias_k = paddle.create_parameter( shape=[1, 1, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.bias_v = paddle.create_parameter( shape=[1, 1, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) def compute_kv(self, key, value): k = self.k_proj(key) v = self.v_proj(value) bsz, _, _ = k.shape if self.add_bias_kv: - k = paddle.concat( - [k, paddle.repeat_interleave( - self.bias_k, bsz, axis=0)], axis=1) - v = paddle.concat( - [v, paddle.repeat_interleave( - self.bias_v, bsz, axis=0)], axis=1) + k = paddle.concat([k, paddle.repeat_interleave(self.bias_k, bsz, axis=0)], axis=1) + v = paddle.concat([v, paddle.repeat_interleave(self.bias_v, bsz, axis=0)], axis=1) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) @@ -122,8 +117,7 @@ def compute_kv(self, key, value): def forward(self, x: paddle.Tensor, attn_mask: paddle.Tensor): # x = paddle.transpose(x, perm=[1,0, 2]) - return super(MultiheadAttention, self).forward( - x, x, x, attn_mask=attn_mask) + return super(MultiheadAttention, self).forward(x, x, x, attn_mask=attn_mask) class ViTAttention(Attention): @@ -142,16 +136,17 @@ def forward(self, x): 
class BlockWithMasking(paddle.nn.Layer): def __init__( - self, - dim: int, - attn_target: Callable, - mlp_ratio: int=4, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=paddle.nn.LayerNorm, - ffn_dropout_rate: float=0.0, - drop_path: float=0.0, - layer_scale_type: Optional[str]=None, - layer_scale_init_value: float=0.0001, ): + self, + dim: int, + attn_target: Callable, + mlp_ratio: int = 4, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = paddle.nn.LayerNorm, + ffn_dropout_rate: float = 0.0, + drop_path: float = 0.0, + layer_scale_type: Optional[str] = None, + layer_scale_init_value: float = 0.0001, + ): super().__init__() assert not isinstance( attn_target, paddle.nn.Layer @@ -167,7 +162,8 @@ def __init__( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, - drop=ffn_dropout_rate, ) + drop=ffn_dropout_rate, + ) self.norm_2 = norm_layer(dim) self.layer_scale_type = layer_scale_type if self.layer_scale_type is not None: @@ -183,21 +179,21 @@ def __init__( self.layer_scale_gamma1 = paddle.create_parameter( shape=gamma_shape, dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=1.0), ) + default_initializer=paddle.nn.initializer.Constant(value=1.0), + ) self.layer_scale_gamma2 = paddle.create_parameter( shape=gamma_shape, dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=1.0), ) + default_initializer=paddle.nn.initializer.Constant(value=1.0), + ) def forward(self, x: paddle.Tensor, attn_mask: paddle.Tensor): if self.layer_scale_type is None: x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask)) x = x + self.drop_path(self.mlp(self.norm_2(x))) else: - x = (x + self.drop_path(self.attn(self.norm_1(x), attn_mask)) * - self.layer_scale_gamma1) - x = x + self.drop_path(self.mlp(self.norm_2( - x))) * self.layer_scale_gamma2 + x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask)) * self.layer_scale_gamma1 + x = x + self.drop_path(self.mlp(self.norm_2(x))) * self.layer_scale_gamma2 return x @@ -206,21 +202,22 @@ def forward(self, x: paddle.Tensor, attn_mask: paddle.Tensor): class SimpleTransformer(paddle.nn.Layer): def __init__( - self, - attn_target: Callable, - embed_dim: int, - num_blocks: int, - block: Callable=BlockWithMasking, - pre_transformer_layer: Optional[Callable]=None, - post_transformer_layer: Optional[Callable]=None, - drop_path_rate: float=0.0, - drop_path_type: str="progressive", - norm_layer: Callable=_LAYER_NORM, - mlp_ratio: int=4, - ffn_dropout_rate: float=0.0, - layer_scale_type: Optional[str]=None, - layer_scale_init_value: float=0.0001, - weight_init_style: str="jax", ): + self, + attn_target: Callable, + embed_dim: int, + num_blocks: int, + block: Callable = BlockWithMasking, + pre_transformer_layer: Optional[Callable] = None, + post_transformer_layer: Optional[Callable] = None, + drop_path_rate: float = 0.0, + drop_path_type: str = "progressive", + norm_layer: Callable = _LAYER_NORM, + mlp_ratio: int = 4, + ffn_dropout_rate: float = 0.0, + layer_scale_type: Optional[str] = None, + layer_scale_init_value: float = 0.0001, + weight_init_style: str = "jax", + ): """ Simple Transformer with the following features 1. 
Supports masked attention @@ -232,27 +229,26 @@ def __init__( super().__init__() self.pre_transformer_layer = pre_transformer_layer if drop_path_type == "progressive": - dpr = [ - x.item() - for x in paddle.linspace( - start=0, stop=drop_path_rate, num=num_blocks) - ] + dpr = [x.item() for x in paddle.linspace(start=0, stop=drop_path_rate, num=num_blocks)] elif drop_path_type == "uniform": dpr = [drop_path_rate for i in range(num_blocks)] else: raise ValueError(f"Unknown drop_path_type: {drop_path_type}") - self.blocks = paddle.nn.Sequential(* [ - block( - dim=embed_dim, - attn_target=attn_target, - mlp_ratio=mlp_ratio, - ffn_dropout_rate=ffn_dropout_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - layer_scale_type=layer_scale_type, - layer_scale_init_value=layer_scale_init_value, ) - for i in range(num_blocks) - ]) + self.blocks = paddle.nn.Sequential( + *[ + block( + dim=embed_dim, + attn_target=attn_target, + mlp_ratio=mlp_ratio, + ffn_dropout_rate=ffn_dropout_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + layer_scale_type=layer_scale_type, + layer_scale_init_value=layer_scale_init_value, + ) + for i in range(num_blocks) + ] + ) self.post_transformer_layer = post_transformer_layer self.weight_init_style = weight_init_style self.apply(self._init_weights) @@ -273,12 +269,13 @@ def _init_weights(self, m): paddle.nn.initializer.Constant(value=1.0)(m.weight) def forward( - self, - tokens: paddle.Tensor, - attn_mask: paddle.Tensor=None, - use_checkpoint: bool=False, - checkpoint_every_n: int=1, - checkpoint_blk_ids: Optional[List[int]]=None, ): + self, + tokens: paddle.Tensor, + attn_mask: paddle.Tensor = None, + use_checkpoint: bool = False, + checkpoint_every_n: int = 1, + checkpoint_blk_ids: Optional[List[int]] = None, + ): """ Inputs - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation) @@ -290,10 +287,7 @@ def forward( if self.pre_transformer_layer: tokens = self.pre_transformer_layer(tokens) if use_checkpoint and checkpoint_blk_ids is None: - checkpoint_blk_ids = [ - blk_id for blk_id in range(len(self.blocks)) - if blk_id % checkpoint_every_n == 0 - ] + checkpoint_blk_ids = [blk_id for blk_id in range(len(self.blocks)) if blk_id % checkpoint_every_n == 0] if checkpoint_blk_ids: checkpoint_blk_ids = set(checkpoint_blk_ids) for blk_id, blk in enumerate(self.blocks): diff --git a/paddlemix/models/imagebind/utils/kaldi.py b/paddlemix/models/imagebind/utils/kaldi.py index 1b6950f2ecc61..59a53dc16407a 100644 --- a/paddlemix/models/imagebind/utils/kaldi.py +++ b/paddlemix/models/imagebind/utils/kaldi.py @@ -45,13 +45,10 @@ def _get_epsilon(device, dtype): def _next_power_of_2(x: int) -> int: """Returns the smallest power of 2 that is greater than x""" - return 1 if x == 0 else 2**(x - 1).bit_length() + return 1 if x == 0 else 2 ** (x - 1).bit_length() -def _get_strided(waveform: paddle.Tensor, - window_size: int, - window_shift: int, - snip_edges: bool) -> paddle.Tensor: +def _get_strided(waveform: paddle.Tensor, window_size: int, window_shift: int, snip_edges: bool) -> paddle.Tensor: """Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) representing how the window is shifted along the waveform. Each row is a frame. 
@@ -90,15 +87,12 @@ def _get_strided(waveform: paddle.Tensor, return waveform.as_strided(sizes, strides) -def _feature_window_function(window_type: str, - window_size: int, - blackman_coeff: float, - device: str, - dtype: int) -> paddle.Tensor: +def _feature_window_function( + window_type: str, window_size: int, blackman_coeff: float, device: str, dtype: int +) -> paddle.Tensor: """Returns a window function with the given type and size""" if window_type == HANNING: - return paddle.hann_window( - window_size, periodic=False, device=device, dtype=dtype) + return paddle.hann_window(window_size, periodic=False, device=device, dtype=dtype) elif window_type == HAMMING: return paddle.hamming_window( window_size, @@ -106,82 +100,79 @@ def _feature_window_function(window_type: str, alpha=0.54, beta=0.46, device=device, - dtype=dtype, ) + dtype=dtype, + ) elif window_type == POVEY: - return paddle.hann_window( - window_size, periodic=False, device=device, dtype=dtype).pow(y=0.85) + return paddle.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(y=0.85) elif window_type == RECTANGULAR: return paddle.ones(shape=window_size, dtype=dtype) elif window_type == BLACKMAN: a = 2 * math.pi / (window_size - 1) window_function = paddle.arange(end=window_size).astype(dtype) - return (blackman_coeff - 0.5 * paddle.cos(x=a * window_function) + - (0.5 - blackman_coeff) * paddle.cos(x=2 * a * window_function)) + return ( + blackman_coeff + - 0.5 * paddle.cos(x=a * window_function) + + (0.5 - blackman_coeff) * paddle.cos(x=2 * a * window_function) + ) else: raise Exception("Invalid window type " + window_type) -def _get_log_energy(strided_input: paddle.Tensor, - epsilon: paddle.Tensor, - energy_floor: float) -> paddle.Tensor: +def _get_log_energy(strided_input: paddle.Tensor, epsilon: paddle.Tensor, energy_floor: float) -> paddle.Tensor: """Returns the log energy of size (m) for a strided_input (m,*)""" device, dtype = strided_input.place, strided_input.dtype - log_energy = paddle.maximum( - x=strided_input.pow(y=2).sum(axis=1), y=epsilon).log() + log_energy = paddle.maximum(x=strided_input.pow(y=2).sum(axis=1), y=epsilon).log() if energy_floor == 0.0: return log_energy return paddle.maximum( x=log_energy, - y=paddle.to_tensor( - data=math.log(energy_floor), dtype=dtype, place=device), ) + y=paddle.to_tensor(data=math.log(energy_floor), dtype=dtype, place=device), + ) def _get_waveform_and_window_properties( - waveform: paddle.Tensor, - channel: int, - sample_frequency: float, - frame_shift: float, - frame_length: float, - round_to_power_of_two: bool, - preemphasis_coefficient: float, ) -> Tuple[paddle.Tensor, int, int, - int]: + waveform: paddle.Tensor, + channel: int, + sample_frequency: float, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float, +) -> Tuple[paddle.Tensor, int, int, int]: """Gets the waveform and window properties""" channel = max(channel, 0) - assert channel < waveform.shape[0], "Invalid channel {} for size {}".format( - channel, waveform.shape[0]) + assert channel < waveform.shape[0], "Invalid channel {} for size {}".format(channel, waveform.shape[0]) waveform = waveform[(channel), :] window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS) window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS) - padded_window_size = (_next_power_of_2(window_size) - if round_to_power_of_two else window_size) - assert (2 <= window_size <= len(waveform) - ), "choose a window size {} that is [2, 
{}]".format(window_size, - len(waveform)) + padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size + assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format( + window_size, len(waveform) + ) assert 0 < window_shift, "`window_shift` must be greater than 0" assert ( padded_window_size % 2 == 0 ), "the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`" - assert (0.0 <= preemphasis_coefficient <= 1.0 - ), "`preemphasis_coefficient` must be between [0,1]" + assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]" assert sample_frequency > 0, "`sample_frequency` must be greater than zero" return waveform, window_shift, window_size, padded_window_size def _get_window( - waveform: paddle.Tensor, - padded_window_size: int, - window_size: int, - window_shift: int, - window_type: str, - blackman_coeff: float, - snip_edges: bool, - raw_energy: bool, - energy_floor: float, - dither: float, - remove_dc_offset: bool, - preemphasis_coefficient: float, ) -> Tuple[paddle.Tensor, - paddle.Tensor]: + waveform: paddle.Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float, +) -> Tuple[paddle.Tensor, paddle.Tensor]: """Gets a window and its log energy Returns: @@ -189,30 +180,25 @@ def _get_window( """ device, dtype = waveform.place, waveform.dtype epsilon = _get_epsilon(device, dtype) - strided_input = _get_strided(waveform, window_size, window_shift, - snip_edges) + strided_input = _get_strided(waveform, window_size, window_shift, snip_edges) if dither != 0.0: - x = paddle.maximum( - x=epsilon, y=paddle.rand( - shape=strided_input.shape, dtype=dtype)) + x = paddle.maximum(x=epsilon, y=paddle.rand(shape=strided_input.shape, dtype=dtype)) rand_gauss = paddle.sqrt(x=-2 * x.log()) * paddle.cos(x=2 * math.pi * x) strided_input = strided_input + rand_gauss * dither if remove_dc_offset: row_means = paddle.mean(x=strided_input, axis=1).unsqueeze(axis=1) strided_input = strided_input - row_means if raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) + signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) if preemphasis_coefficient != 0.0: offset_strided_input = paddle.pad_from_torch( - strided_input.unsqueeze(axis=0), (1, 0), - mode="replicate").squeeze(axis=0) - - strided_input = (strided_input - preemphasis_coefficient * - offset_strided_input[:, :-1]) - window_function = _feature_window_function(window_type, window_size, - blackman_coeff, device, - dtype).unsqueeze(axis=0) + strided_input.unsqueeze(axis=0), (1, 0), mode="replicate" + ).squeeze(axis=0) + + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1] + window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze( + axis=0 + ) strided_input = strided_input * window_function if padded_window_size != window_size: padding_right = padded_window_size - window_size @@ -220,15 +206,14 @@ def _get_window( strided_input.unsqueeze(axis=0), (0, padding_right), mode="constant", - value=0, ).squeeze(axis=0) + value=0, + ).squeeze(axis=0) if not raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) + signal_log_energy = 
_get_log_energy(strided_input, epsilon, energy_floor) return strided_input, signal_log_energy -def _subtract_column_mean(tensor: paddle.Tensor, - subtract_mean: bool) -> paddle.Tensor: +def _subtract_column_mean(tensor: paddle.Tensor, subtract_mean: bool) -> paddle.Tensor: if subtract_mean: col_means = paddle.mean(x=tensor, axis=0).unsqueeze(axis=0) tensor = tensor - col_means @@ -236,22 +221,23 @@ def _subtract_column_mean(tensor: paddle.Tensor, def spectrogram( - waveform: paddle.Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - min_duration: float=0.0, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, - snip_edges: bool=True, - subtract_mean: bool=False, - window_type: str=POVEY, ) -> paddle.Tensor: + waveform: paddle.Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + min_duration: float = 0.0, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + window_type: str = POVEY, +) -> paddle.Tensor: """Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's compute-spectrogram-feats. @@ -288,18 +274,15 @@ def spectrogram( """ device, dtype = waveform.place, waveform.dtype epsilon = _get_epsilon(device, dtype) - ( + (waveform, window_shift, window_size, padded_window_size,) = _get_waveform_and_window_properties( waveform, - window_shift, - window_size, - padded_window_size, ) = _get_waveform_and_window_properties( - waveform, - channel, - sample_frequency, - frame_shift, - frame_length, - round_to_power_of_two, - preemphasis_coefficient, ) + channel, + sample_frequency, + frame_shift, + frame_length, + round_to_power_of_two, + preemphasis_coefficient, + ) if len(waveform) < min_duration * sample_frequency: return paddle.empty(shape=[0]) strided_input, signal_log_energy = _get_window( @@ -314,7 +297,8 @@ def spectrogram( energy_floor, dither, remove_dc_offset, - preemphasis_coefficient, ) + preemphasis_coefficient, + ) fft = paddle.fft.rfft(x=strided_input) power_spectrum = paddle.maximum(x=fft.abs().pow(y=2.0), y=epsilon).log() power_spectrum[:, (0)] = signal_log_energy @@ -339,12 +323,13 @@ def mel_scale(freq: paddle.Tensor) -> paddle.Tensor: def vtln_warp_freq( - vtln_low_cutoff: float, - vtln_high_cutoff: float, - low_freq: float, - high_freq: float, - vtln_warp_factor: float, - freq: paddle.Tensor, ) -> paddle.Tensor: + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: paddle.Tensor, +) -> paddle.Tensor: """This computes a VTLN warping function that is not the same as HTK's one, but has similar inputs (this function has the advantage of never producing empty bins). 
@@ -381,11 +366,8 @@ def vtln_warp_freq( Returns: Tensor: Freq after vtln warp """ - assert (vtln_low_cutoff > low_freq - ), "be sure to set the vtln_low option higher than low_freq" - assert ( - vtln_high_cutoff < high_freq - ), "be sure to set the vtln_high option lower than high_freq [or negative]" + assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq" + assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]" l = vtln_low_cutoff * max(1.0, vtln_warp_factor) h = vtln_high_cutoff * min(1.0, vtln_warp_factor) scale = 1.0 / vtln_warp_factor @@ -395,9 +377,9 @@ def vtln_warp_freq( scale_left = (Fl - low_freq) / (l - low_freq) scale_right = (high_freq - Fh) / (high_freq - h) res = paddle.empty_like(x=freq) - outside_low_high_freq = paddle.less_than( - x=freq, y=paddle.to_tensor(low_freq)) | paddle.greater_than( - x=freq, y=paddle.to_tensor(high_freq)) + outside_low_high_freq = paddle.less_than(x=freq, y=paddle.to_tensor(low_freq)) | paddle.greater_than( + x=freq, y=paddle.to_tensor(high_freq) + ) before_l = paddle.less_than(x=freq, y=paddle.to_tensor(l)) before_h = paddle.less_than(x=freq, y=paddle.to_tensor(h)) after_h = paddle.greater_equal(x=freq, y=paddle.to_tensor(h)) @@ -409,12 +391,13 @@ def vtln_warp_freq( def vtln_warp_mel_freq( - vtln_low_cutoff: float, - vtln_high_cutoff: float, - low_freq, - high_freq: float, - vtln_warp_factor: float, - mel_freq: paddle.Tensor, ) -> paddle.Tensor: + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: paddle.Tensor, +) -> paddle.Tensor: """ Args: vtln_low_cutoff (float): Lower frequency cutoffs for VTLN @@ -434,18 +417,21 @@ def vtln_warp_mel_freq( low_freq, high_freq, vtln_warp_factor, - inverse_mel_scale(mel_freq), )) + inverse_mel_scale(mel_freq), + ) + ) def get_mel_banks( - num_bins: int, - window_length_padded: int, - sample_freq: float, - low_freq: float, - high_freq: float, - vtln_low: float, - vtln_high: float, - vtln_warp_factor: float, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float, +) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Returns: (Tensor, Tensor): The tuple consists of ``bins`` (which is @@ -459,10 +445,8 @@ def get_mel_banks( if high_freq <= 0.0: high_freq += nyquist assert ( - 0.0 <= low_freq < nyquist and 0.0 < high_freq <= nyquist and - low_freq < high_freq - ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format( - low_freq, high_freq, nyquist) + 0.0 <= low_freq < nyquist and 0.0 < high_freq <= nyquist and low_freq < high_freq + ), "Bad values in options: low-freq {} and high-freq {} vs. 
nyquist {}".format(low_freq, high_freq, nyquist) fft_bin_width = sample_freq / window_length_padded mel_low_freq = mel_scale_scalar(low_freq) mel_high_freq = mel_scale_scalar(high_freq) @@ -472,72 +456,69 @@ def get_mel_banks( vtln_high += nyquist assert ( - vtln_warp_factor == 1.0 or low_freq < vtln_low < high_freq and - 0.0 < vtln_high < high_freq and vtln_low < vtln_high + vtln_warp_factor == 1.0 + or low_freq < vtln_low < high_freq + and 0.0 < vtln_high < high_freq + and vtln_low < vtln_high ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format( - vtln_low, vtln_high, low_freq, high_freq) + vtln_low, vtln_high, low_freq, high_freq + ) bin = paddle.arange(end=num_bins).unsqueeze(axis=1) left_mel = mel_low_freq + bin * mel_freq_delta center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta if vtln_warp_factor != 1.0: - left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, left_mel) - center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, - high_freq, vtln_warp_factor, center_mel) - right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, right_mel) + left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel) + center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel) + right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel) center_freqs = inverse_mel_scale(center_mel) - mel = mel_scale(fft_bin_width * paddle.arange(end=num_fft_bins)).unsqueeze( - axis=0) + mel = mel_scale(fft_bin_width * paddle.arange(end=num_fft_bins)).unsqueeze(axis=0) up_slope = (mel - left_mel) / (center_mel - left_mel) down_slope = (right_mel - mel) / (right_mel - center_mel) if vtln_warp_factor == 1.0: - bins = paddle.maximum( - x=paddle.zeros(shape=[1]), - y=paddle.minimum( - x=up_slope, y=down_slope)) + bins = paddle.maximum(x=paddle.zeros(shape=[1]), y=paddle.minimum(x=up_slope, y=down_slope)) else: bins = paddle.zeros_like(x=up_slope) - up_idx = paddle.greater_than( - x=mel, y=paddle.to_tensor(left_mel)) & paddle.less_equal( - x=mel, y=paddle.to_tensor(center_mel)) - down_idx = paddle.greater_than( - x=mel, y=paddle.to_tensor(center_mel)) & paddle.less_than( - x=mel, y=paddle.to_tensor(right_mel)) + up_idx = paddle.greater_than(x=mel, y=paddle.to_tensor(left_mel)) & paddle.less_equal( + x=mel, y=paddle.to_tensor(center_mel) + ) + down_idx = paddle.greater_than(x=mel, y=paddle.to_tensor(center_mel)) & paddle.less_than( + x=mel, y=paddle.to_tensor(right_mel) + ) bins[up_idx] = up_slope[up_idx] bins[down_idx] = down_slope[down_idx] return bins, center_freqs def fbank( - waveform: paddle.Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - min_duration: float=0.0, - num_mel_bins: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - use_log_fbank: bool=True, - use_power: bool=True, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY, ) -> paddle.Tensor: + waveform: 
paddle.Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + use_log_fbank: bool = True, + use_power: bool = True, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> paddle.Tensor: """Create a fbank from a raw audio signal. This matches the input/output of Kaldi's compute-fbank-feats. @@ -586,18 +567,15 @@ def fbank( where m is calculated in _get_strided """ device, dtype = waveform.place, waveform.dtype - ( + (waveform, window_shift, window_size, padded_window_size,) = _get_waveform_and_window_properties( waveform, - window_shift, - window_size, - padded_window_size, ) = _get_waveform_and_window_properties( - waveform, - channel, - sample_frequency, - frame_shift, - frame_length, - round_to_power_of_two, - preemphasis_coefficient, ) + channel, + sample_frequency, + frame_shift, + frame_length, + round_to_power_of_two, + preemphasis_coefficient, + ) if len(waveform) < min_duration * sample_frequency: return paddle.empty(shape=[0], dtype=dtype) @@ -613,7 +591,8 @@ def fbank( energy_floor, dither, remove_dc_offset, - preemphasis_coefficient, ) + preemphasis_coefficient, + ) spectrum = paddle.fft.rfft(x=strided_input).abs() if use_power: spectrum = spectrum.pow(y=2.0) @@ -625,59 +604,56 @@ def fbank( high_freq, vtln_low, vtln_high, - vtln_warp, ) + vtln_warp, + ) mel_energies = mel_energies - mel_energies = paddle.pad_from_torch( - mel_energies, (0, 1), mode="constant", value=0) + mel_energies = paddle.pad_from_torch(mel_energies, (0, 1), mode="constant", value=0) mel_energies = paddle.mm(input=spectrum, mat2=mel_energies.T) if use_log_fbank: - mel_energies = paddle.maximum( - x=mel_energies, y=_get_epsilon(device, dtype)).log() + mel_energies = paddle.maximum(x=mel_energies, y=_get_epsilon(device, dtype)).log() if use_energy: signal_log_energy = signal_log_energy.unsqueeze(axis=1) if htk_compat: - mel_energies = paddle.concat( - x=(mel_energies, signal_log_energy), axis=1) + mel_energies = paddle.concat(x=(mel_energies, signal_log_energy), axis=1) else: - mel_energies = paddle.concat( - x=(signal_log_energy, mel_energies), axis=1) + mel_energies = paddle.concat(x=(signal_log_energy, mel_energies), axis=1) mel_energies = _subtract_column_mean(mel_energies, subtract_mean) return mel_energies def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> paddle.Tensor: i = paddle.arange(end=num_ceps) - return 1.0 + 0.5 * cepstral_lifter * paddle.sin(x=math.pi * i / - cepstral_lifter) + return 1.0 + 0.5 * cepstral_lifter * paddle.sin(x=math.pi * i / cepstral_lifter) def mfcc( - waveform: paddle.Tensor, - blackman_coeff: float=0.42, - cepstral_lifter: float=22.0, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - num_ceps: int=13, - min_duration: float=0.0, - num_mel_bins: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - 
round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY, ) -> paddle.Tensor: + waveform: paddle.Tensor, + blackman_coeff: float = 0.42, + cepstral_lifter: float = 22.0, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + num_ceps: int = 13, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> paddle.Tensor: """Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's compute-mfcc-feats. @@ -725,11 +701,11 @@ def mfcc( Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``) where m is calculated in _get_strided """ - assert (num_ceps <= num_mel_bins - ), "num_ceps cannot be larger than num_mel_bins: %d vs %d" % ( - num_ceps, - num_mel_bins, ) - device, dtype = waveform.place, waveform.dtype + assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % ( + num_ceps, + num_mel_bins, + ) + # device, dtype = waveform.place, waveform.dtype feature = fbank( waveform=waveform, blackman_coeff=blackman_coeff, @@ -756,17 +732,17 @@ def mfcc( vtln_high=vtln_high, vtln_low=vtln_low, vtln_warp=vtln_warp, - window_type=window_type, ) + window_type=window_type, + ) if use_energy: signal_log_energy = feature[:, (num_mel_bins if htk_compat else 0)] mel_offset = int(not htk_compat) - feature = feature[:, mel_offset:num_mel_bins + mel_offset] + feature = feature[:, mel_offset : num_mel_bins + mel_offset] dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins) feature = feature.matmul(y=dct_matrix) if cepstral_lifter != 0.0: - lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze( - axis=0) + lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(axis=0) feature *= lifter_coeffs if use_energy: diff --git a/paddlemix/models/imagebind/utils/paddle_aux.py b/paddlemix/models/imagebind/utils/paddle_aux.py index 6ffcfe6590f6e..0ea8c1927317d 100644 --- a/paddlemix/models/imagebind/utils/paddle_aux.py +++ b/paddlemix/models/imagebind/utils/paddle_aux.py @@ -59,10 +59,10 @@ def to(self, *args, **kwargs): if isinstance(kwargs["x"], paddle.dtype): dtype = kwargs["x"] elif isinstance(kwargs["x"], str) and kwargs["x"] not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = kwargs["x"] elif isinstance(kwargs["x"], paddle.Tensor): @@ -78,8 +78,7 @@ def to(self, *args, **kwargs): if x not in ["cpu", "cuda", "ipu", "xpu"]: dtype = kwargs["x"] else: - dtype = kwargs["y"] if isinstance(kwargs["y"], - str) else self.dtype + dtype = kwargs["y"] if isinstance(kwargs["y"], str) else self.dtype else: dtype = kwargs["x"] return paddle.cast(self, dtype) @@ -99,11 +98,9 @@ def split(self, *args, **kwargs): elif kwargs: if "dim" in kwargs: kwargs["axis"] = kwargs.pop("dim") - kwargs["num_or_sections"] = self.shape[kwargs[ - "axis"]] // kwargs.pop("split_size") + 
kwargs["num_or_sections"] = self.shape[kwargs["axis"]] // kwargs.pop("split_size") else: - kwargs["num_or_sections"] = self.shape[0] // kwargs.pop( - "split_size") + kwargs["num_or_sections"] = self.shape[0] // kwargs.pop("split_size") return paddle.split(self, **kwargs) @@ -116,7 +113,7 @@ def i0(self, input): K = paddle.arange(0, 20).astype("float32") m = 0 for k in K: - m += ((input**2) / 4)**k / math.factorial(k)**2 + m += ((input**2) / 4) ** k / math.factorial(k) ** 2 return m @@ -128,7 +125,7 @@ def i0(self, input): def stride(self, dim): shape = self.shape shape.append(1) - return paddle.cumprod(paddle.to_tensor(shape)[dim + 1:], dim=0)[-1].item() + return paddle.cumprod(paddle.to_tensor(shape)[dim + 1 :], dim=0)[-1].item() setattr(paddle.Tensor, "stride", stride) @@ -144,14 +141,20 @@ def as_strided(self, size, stride): hh = paddle.expand(h, (dx, dy)).flatten(0) datas = [] for i in range(0, size[0] * stride[0], stride[0]): - axes = [0, ] - starts = [i, ] - ends = [stride[1] * size[1] + i, ] - strides = [stride[1], ] - new_x = paddle.strided_slice( - ww, axes=axes, starts=starts, ends=ends, strides=strides) - new_y = paddle.strided_slice( - hh, axes=axes, starts=starts, ends=ends, strides=strides) + axes = [ + 0, + ] + starts = [ + i, + ] + ends = [ + stride[1] * size[1] + i, + ] + strides = [ + stride[1], + ] + new_x = paddle.strided_slice(ww, axes=axes, starts=starts, ends=ends, strides=strides) + new_y = paddle.strided_slice(hh, axes=axes, starts=starts, ends=ends, strides=strides) datas.append(self[new_y, new_x]) return paddle.stack(datas) @@ -163,19 +166,15 @@ def hann_window(window_length, periodic=True, **kwargs): N = window_length x = paddle.arange(N) if periodic: - return paddle.sin(math.pi * x / (N))**2 + return paddle.sin(math.pi * x / (N)) ** 2 else: - return paddle.sin(math.pi * x / (N - 1))**2 + return paddle.sin(math.pi * x / (N - 1)) ** 2 setattr(paddle, "hann_window", hann_window) -def hamming_window(window_length, - periodic=True, - alpha=0.54, - beta=0.46, - **kwargs): +def hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, **kwargs): N = window_length x = paddle.arange(N) if periodic: @@ -189,17 +188,14 @@ def hamming_window(window_length, def pad(input, pad, mode="constant", value=0.0): data_formats = {3: "NCL", 4: "NCHW", 5: "NCDHW"} - shape = input.shape if input.dim() == 2: input = input.unsqueeze(0) n = len(input.shape) pad = list(pad) + [0] * (n - 3) * 2 - pad = pad[:(n - 2) * 2] - return paddle.nn.functional.pad(input, - pad=tuple(pad), - mode=mode, - value=value, - data_format=data_formats[n]).squeeze() + pad = pad[: (n - 2) * 2] + return paddle.nn.functional.pad( + input, pad=tuple(pad), mode=mode, value=value, data_format=data_formats[n] + ).squeeze() setattr(paddle, "pad_from_torch", pad) diff --git a/paddlemix/models/imagebind/utils/resample.py b/paddlemix/models/imagebind/utils/resample.py index 10cc433bf7d66..c048c3df74fb9 100644 --- a/paddlemix/models/imagebind/utils/resample.py +++ b/paddlemix/models/imagebind/utils/resample.py @@ -13,29 +13,28 @@ # limitations under the License. 
import math -import sys -from typing import List, Optional, Tuple, Union +from typing import Optional import paddle def _get_sinc_resample_kernel( - orig_freq: int, - new_freq: int, - gcd: int, - lowpass_filter_width: int=6, - rolloff: float=0.99, - resampling_method: str="sinc_interpolation", - beta: Optional[float]=None, - device: str=str("cpu").replace("cuda", "gpu"), - dtype: Optional[paddle.dtype]=None, ): + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, + device: str = str("cpu").replace("cuda", "gpu"), + dtype: Optional[paddle.dtype] = None, +): if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): raise Exception( "Frequencies must be of integer type to ensure quality resampling computation. To work around this, manually convert both frequencies to integer values that maintain their resampling rate ratio before passing them into the function. Example: To downsample a 44100 hz waveform by a factor of 8, use `orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`." ) if resampling_method not in ["sinc_interpolation", "kaiser_window"]: - raise ValueError("Invalid resampling method: {}".format( - resampling_method)) + raise ValueError("Invalid resampling method: {}".format(resampling_method)) orig_freq = int(orig_freq) // gcd new_freq = int(new_freq) // gcd assert lowpass_filter_width > 0 @@ -49,54 +48,41 @@ def _get_sinc_resample_kernel( t = (-i / new_freq + idx / orig_freq) * base_freq t = t.clip_(min=-lowpass_filter_width, max=lowpass_filter_width) if resampling_method == "sinc_interpolation": - window = paddle.cos(x=t * math.pi / lowpass_filter_width / 2)**2 + window = paddle.cos(x=t * math.pi / lowpass_filter_width / 2) ** 2 else: if beta is None: beta = 14.769656459379492 beta_tensor = paddle.to_tensor(data=float(beta)) - window = paddle.i0(beta_tensor * paddle.sqrt( - x=1 - (t / lowpass_filter_width)**2)) / paddle.i0(beta_tensor) + window = paddle.i0(beta_tensor * paddle.sqrt(x=1 - (t / lowpass_filter_width) ** 2)) / paddle.i0( + beta_tensor + ) t *= math.pi # breakpoint() - kernel = paddle.where( - condition=t == 0, - x=paddle.to_tensor(data=1.0), - y=paddle.sin(x=t) / t) + kernel = paddle.where(condition=t == 0, x=paddle.to_tensor(data=1.0), y=paddle.sin(x=t) / t) paddle.assign(paddle.multiply(kernel, window), kernel) # kernel.scale_(scale=window) kernels.append(kernel) scale = base_freq / orig_freq - kernels = paddle.stack(x=kernels).reshape( - (new_freq, 1, -1)).scale_(scale=scale) + kernels = paddle.stack(x=kernels).reshape((new_freq, 1, -1)).scale_(scale=scale) if dtype is None: kernels = kernels.to(dtype="float32") return kernels, width -def _apply_sinc_resample_kernel(waveform, - orig_freq: int, - new_freq: int, - gcd: int, - kernel, - width: int): +def _apply_sinc_resample_kernel(waveform, orig_freq: int, new_freq: int, gcd: int, kernel, width: int): if not waveform.is_floating_point(): - raise TypeError( - f"Expected floating point type for waveform tensor, but received {waveform.dtype}." 
- ) + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") orig_freq = int(orig_freq) // gcd new_freq = int(new_freq) // gcd shape = waveform.shape waveform = waveform.reshape((-1, shape[-1])) num_wavs, length = waveform.shape - waveform = paddle.nn.functional.pad(waveform.unsqueeze(1), - (width, width + orig_freq), - data_format="NCL").squeeze() + waveform = paddle.nn.functional.pad(waveform.unsqueeze(1), (width, width + orig_freq), data_format="NCL").squeeze() if waveform.dim() == 1: waveform = waveform.unsqueeze(0) - resampled = paddle.nn.functional.conv1d( - x=waveform[:, (None)], weight=kernel, stride=orig_freq) + resampled = paddle.nn.functional.conv1d(x=waveform[:, (None)], weight=kernel, stride=orig_freq) x = resampled perm_0 = list(range(x.ndim)) perm_0[1] = 2 @@ -110,13 +96,14 @@ def _apply_sinc_resample_kernel(waveform, def resample( - waveform, - orig_freq: int, - new_freq: int, - lowpass_filter_width: int=6, - rolloff: float=0.99, - resampling_method: str="sinc_interpolation", - beta: Optional[float]=None, ): + waveform, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, +): """Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`]. .. devices:: CPU CUDA @@ -155,8 +142,8 @@ def resample( resampling_method, beta, waveform.place, - waveform.dtype, ) + waveform.dtype, + ) - resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, - kernel, width) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) return resampled diff --git a/paddlemix/models/minigpt4/configuration.py b/paddlemix/models/minigpt4/configuration.py index 49587bb85c845..6eac08f0366d5 100644 --- a/paddlemix/models/minigpt4/configuration.py +++ b/paddlemix/models/minigpt4/configuration.py @@ -16,8 +16,7 @@ import os from typing import Union -from paddlenlp.transformers.auto.modeling import \ - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from paddlenlp.transformers.auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.transformers.llama.configuration import LlamaConfig @@ -74,23 +73,24 @@ class MiniGPT4VisionConfig(PretrainedConfig): model_type = "mimigpt4_vision_model" def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - projection_dim=512, - num_hidden_layers=39, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=1e-10, - initializer_factor=1.0, - qkv_bias=True, - **kwargs, ): + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -111,17 +111,13 @@ def __init__( self.qkv_bias = qkv_bias @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, 
pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from MiniGPT4Config if config_dict.get("model_type") == "minigpt4": config_dict = config_dict["vision_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -188,24 +184,25 @@ class MiniGPT4QFormerConfig(PretrainedConfig): model_type = "minigpt4_qformer" def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, ): + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size @@ -225,18 +222,14 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from MiniGPT4Config if config_dict.get("model_type") == "minigpt4": config_dict = config_dict["qformer_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -290,58 +283,50 @@ class MiniGPT4Config(PretrainedConfig): is_composition = True def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - **kwargs, ): + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): super().__init__(**kwargs) if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the MiniGPT4VisionConfig with default values." - ) + logger.info("vision_config is None. 
initializing the MiniGPT4VisionConfig with default values.") if qformer_config is None: qformer_config = {} - logger.info( - "qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values." - ) + logger.info("qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values.") if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the text config with default values (`LlamaConfig`)." - ) + logger.info("text_config is None. Initializing the text config with default values (`LlamaConfig`).") self.vision_config = MiniGPT4VisionConfig(**vision_config) self.qformer_config = MiniGPT4QFormerConfig(**qformer_config) - text_model_type = (text_config["model_type"] - if "model_type" in text_config else "llama") + text_model_type = text_config["model_type"] if "model_type" in text_config else "llama" if text_model_type == "llama": self.text_config = LlamaConfig(**text_config) else: - raise ValueError( - "Only llama accepted for model_type, but accepted {}.".format( - text_model_type)) + raise ValueError("Only llama accepted for model_type, but accepted {}.".format(text_model_type)) self.num_query_tokens = num_query_tokens self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size - self.use_decoder_only_language_model = ( - self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES self.initializer_factor = 1.0 self.initializer_range = 0.02 @classmethod def from_vision_qformer_text_configs( - cls, - vision_config: MiniGPT4VisionConfig, - qformer_config: MiniGPT4QFormerConfig, - text_config: PretrainedConfig, - **kwargs, ): + cls, + vision_config: MiniGPT4VisionConfig, + qformer_config: MiniGPT4QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): r""" Instantiate a [`MiniGPT4Config`] (or a derived class) from a vision model, Q-Former and language model configurations. 
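# --- Illustrative sketch only, not part of this diff --------------------------
# MiniGPT4Config composes a vision config, a Q-Former config and a Llama text
# config, and `from_vision_qformer_text_configs` (documented just above) builds
# it from the three sub-configs. A minimal, hedged usage sketch, assuming only
# the import paths already shown in this file:
from paddlemix.models.minigpt4.configuration import (
    MiniGPT4Config,
    MiniGPT4QFormerConfig,
    MiniGPT4VisionConfig,
)
from paddlenlp.transformers.llama.configuration import LlamaConfig

vision_cfg = MiniGPT4VisionConfig(image_size=224, patch_size=14)
qformer_cfg = MiniGPT4QFormerConfig(num_hidden_layers=12, cross_attention_frequency=2)
text_cfg = LlamaConfig()
config = MiniGPT4Config.from_vision_qformer_text_configs(
    vision_cfg, qformer_cfg, text_cfg, num_query_tokens=32
)
# __init__ ties the Q-Former's cross-attention width to the vision hidden size:
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size
# ------------------------------------------------------------------------------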
@@ -353,7 +338,8 @@ def from_vision_qformer_text_configs( vision_config=vision_config.to_dict(), qformer_config=qformer_config.to_dict(), text_config=text_config.to_dict(), - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/minigpt4/modeling.py b/paddlemix/models/minigpt4/modeling.py index 85ef2a007f2b0..4652192427488 100644 --- a/paddlemix/models/minigpt4/modeling.py +++ b/paddlemix/models/minigpt4/modeling.py @@ -23,12 +23,18 @@ from paddle.nn import CrossEntropyLoss from paddlenlp.transformers.llama.modeling import LlamaForCausalLM from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, - ModelOutput) + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) from paddlenlp.transformers.model_utils import ( - PretrainedModel, apply_chunking_to_forward, - find_pruneable_heads_and_indices, prune_linear_layer) + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) from ...activations import ACT2FN from ...utils.initializer import normal_, ones_, zeros_ @@ -37,8 +43,7 @@ MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] -from .configuration import (MiniGPT4Config, MiniGPT4QFormerConfig, - MiniGPT4VisionConfig) +from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig __all__ = [ "MiniGPT4Model", @@ -53,15 +58,14 @@ def Parameter(tensor): return paddle.create_parameter( tensor.shape, dtype=tensor.dtype, - default_initializer=nn.initializer.Assign(tensor), ) + default_initializer=nn.initializer.Assign(tensor), + ) def convert_weights_to_dtype(model, dtype: str): # trying to convert model dtype if necessary if dtype not in ["float16", "float32", "float64"]: - raise ValueError( - "Not supported dtype: {}., only [float16, float32, float64] supported.". 
- format(dtype)) + raise ValueError("Not supported dtype: {}., only [float16, float32, float64] supported.".format(dtype)) dtype_mapping = { "float16": paddle.float16, "float32": paddle.float32, @@ -71,12 +75,9 @@ def convert_weights_to_dtype(model, dtype: str): def convert_for_vit(layer): if isinstance(layer, (nn.Linear, nn.Conv1D, nn.Conv2D)): if layer.weight.dtype != dtype_mapping[dtype]: - layer.weight = transfer_param( - layer.weight, restore_data=True, dtype=dtype) - if layer.bias is not None and layer.bias.dtype != dtype_mapping[ - dtype]: - layer.bias = transfer_param( - layer.bias, restore_data=True, dtype=dtype) + layer.weight = transfer_param(layer.weight, restore_data=True, dtype=dtype) + if layer.bias is not None and layer.bias.dtype != dtype_mapping[dtype]: + layer.bias = transfer_param(layer.bias, restore_data=True, dtype=dtype) if isinstance(model, MiniGPT4VisionModel): model.apply(convert_for_vit) @@ -111,9 +112,11 @@ class MiniGPT4ForConditionalGenerationModelOutput(ModelOutput): def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in - ["vision_outputs", "qformer_outputs", "language_model_outputs"] else - getattr(self, k).to_tuple() for k in self.keys()) + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) class MiniGPT4PretrainedModel(PretrainedModel): @@ -125,13 +128,14 @@ class MiniGPT4PretrainedModel(PretrainedModel): config_class = MiniGPT4Config base_model_prefix = "minigpt4" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", ] + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or - isinstance(module, nn.Linear)): + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -141,7 +145,9 @@ def _init_weights(self, module): factor = self.config.vision_config.initializer_range trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) trunc_normal_(module.position_embedding) - trunc_normal_(module.class_embedding, ) + trunc_normal_( + module.class_embedding, + ) elif isinstance(module, nn.LayerNorm): zeros_(module.bias) ones_(module.weight) @@ -154,12 +160,13 @@ def _set_gradient_checkpointing(self, module, value=False): @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path, - from_hf_hub: bool=False, - subfolder: str=None, - *args, - **kwargs, ): + cls, + pretrained_model_name_or_path, + from_hf_hub: bool = False, + subfolder: str = None, + *args, + **kwargs, + ): vit_dtype = kwargs.pop("vit_dtype", "float16") qformer_dtype = kwargs.pop("qformer_dtype", "float32") llama_dtype = kwargs.pop("llama_dtype", "float16") @@ -169,10 +176,10 @@ def from_pretrained( from_hf_hub=from_hf_hub, subfolder=subfolder, *args, - **kwargs, ) + **kwargs, + ) - logger.info( - "Trying to convert dtype for MiniGPT4 model, it may take a while.") + logger.info("Trying to convert dtype for MiniGPT4 model, it may take a while.") if isinstance(model, (MiniGPT4Model, MiniGPT4ForConditionalGeneration)): convert_weights_to_dtype(model.vision_model, dtype=vit_dtype) convert_weights_to_dtype(model.qformer, dtype=qformer_dtype) @@ -203,30 +210,26 @@ def __init__(self, config: 
MiniGPT4VisionConfig): in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, - stride=self.patch_size, ) + stride=self.patch_size, + ) - self.num_patches = (self.image_size // self.patch_size)**2 + self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 - self.position_embedding = Parameter( - paddle.randn([1, self.num_positions, self.embed_dim])) + self.position_embedding = Parameter(paddle.randn([1, self.num_positions, self.embed_dim])) def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding( - pixel_values) # shape = [*, width, grid, grid] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] patch_embeds_shape = paddle.shape(patch_embeds) patch_embeds = paddle.reshape( - patch_embeds, - shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1]).transpose( - [0, 2, 1]) + patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] + ).transpose([0, 2, 1]) - class_embeds = self.class_embedding.expand( - [batch_size, 1, -1]).cast(target_dtype) + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) - embeddings = embeddings + self.position_embedding[:, :embeddings.shape[ - 1], :].cast(target_dtype) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) return embeddings @@ -242,13 +245,13 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") + f" {self.num_heads})." 
+ ) self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) # small tweak here compared to CLIP, no bias here - self.qkv = nn.Linear( - self.embed_dim, 3 * self.embed_dim, bias_attr=False) + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) if config.qkv_bias: q_bias = Parameter(paddle.zeros([self.embed_dim])) @@ -258,41 +261,37 @@ def __init__(self, config): v_bias = None if q_bias is not None: - qkv_bias = paddle.concat( - (q_bias, paddle.zeros_like(v_bias), v_bias)) + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) self.qkv.bias = Parameter(qkv_bias) self.projection = nn.Linear(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - head_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[ - paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ - paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape mixed_qkv = self.qkv(hidden_states) - mixed_qkv = mixed_qkv.reshape( - [bsz, tgt_len, 3, self.num_heads, - embed_dim // self.num_heads]).transpose([2, 0, 3, 1, 4]) + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) query_states, key_states, value_states = ( mixed_qkv[0], mixed_qkv[1], - mixed_qkv[2], ) + mixed_qkv[2], + ) # Take the dot product between "query" and "key" to get the raw attention scores. 
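        # The reformatted lines below are standard scaled dot-product attention:
        # scores = Q @ K^T, scaled by self.scale (= head_dim ** -0.5), softmax over
        # the key axis to obtain attention_probs, an optional multiplicative
        # head_mask, and a weighted sum with V that is transposed back to
        # [batch, seq_len, num_heads, head_dim] before the output projection.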
- attention_scores = paddle.matmul( - query_states, key_states, transpose_y=True) + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) attention_scores = attention_scores * self.scale @@ -307,16 +306,16 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = paddle.matmul(attention_probs, value_states).transpose( - [0, 2, 1, 3]) + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [self.embed_dim, ] + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] context_layer = context_layer.reshape(new_context_layer_shape) output = self.projection(context_layer) - outputs = (output, attention_probs) if output_attentions else (output, - None) + outputs = (output, attention_probs) if output_attentions else (output, None) return outputs @@ -341,17 +340,16 @@ def __init__(self, config: MiniGPT4Config): super().__init__() self.embed_dim = config.hidden_size self.self_attn = MiniGPT4Attention(config) - self.layer_norm1 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = MiniGPT4MLP(config) - self.layer_norm2 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -368,7 +366,8 @@ def forward( hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, head_mask=attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = hidden_states + residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) @@ -376,10 +375,10 @@ def forward( hidden_states = hidden_states + residual - outputs = (hidden_states, ) + outputs = (hidden_states,) if output_attentions: - outputs += (attn_weights, ) + outputs += (attn_weights,) return outputs @@ -396,20 +395,17 @@ class MiniGPT4Encoder(nn.Layer): def __init__(self, config: MiniGPT4Config): super().__init__() self.config = config - self.layers = nn.LayerList([ - MiniGPT4EncoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.LayerList([MiniGPT4EncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -430,13 +426,11 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -444,7 +438,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -456,29 +450,30 @@ def custom_forward(*inputs): layer_outputs = recompute( create_custom_forward(encoder_layer), hidden_states, - attention_mask, ) + attention_mask, + ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, ) + attentions=all_attentions, + ) class MiniGPT4VisionModel(MiniGPT4PretrainedModel): @@ -492,26 +487,23 @@ def __init__(self, config: MiniGPT4VisionConfig): self.embeddings = MiniGPT4VisionEmbeddings(config) self.encoder = MiniGPT4Encoder(config) - self.post_layernorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict 
is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -522,7 +514,8 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) @@ -537,7 +530,8 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) def get_input_embeddings(self): return self.embeddings @@ -547,35 +541,29 @@ class MiniGPT4QFormerMultiHeadAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention heads (%d)" - % (config.hidden_size, config.num_attention_heads)) + % (config.hidden_size, config.num_attention_heads) + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) - self.value = nn.Linear(config.encoder_hidden_size, - self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -599,31 +587,29 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. 
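        # Three cases are handled below: cross-attention (keys/values are projected
        # from encoder_hidden_states and the encoder's attention mask replaces the
        # self-attention mask), cached self-attention during incremental decoding
        # (the fresh key/value slices are concatenated onto past_key_value along the
        # sequence axis), and plain self-attention (keys/values come directly from
        # hidden_states).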
is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -635,37 +621,25 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul( - query_layer, key_layer, transpose_y=True) + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": seq_length = hidden_states.shape[1] - position_ids_l = paddle.arange( - seq_length, dtype="int64").reshape([-1, 1]) - position_ids_r = paddle.arange( - seq_length, dtype="int64").reshape([1, -1]) + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.cast( - dtype=query_layer.dtype) # fp16 compatibility + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": - relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = ( - attention_scores + relative_position_scores_query + - relative_position_scores_key) + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) @@ -694,10 +668,9 @@ def forward( ] 
context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -705,12 +678,10 @@ class MiniGPT4QFormerSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -720,8 +691,7 @@ def forward(self, hidden_states: paddle.Tensor, class MiniGPT4QFormerAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() - self.attention = MiniGPT4QFormerMultiHeadAttention(config, - is_cross_attention) + self.attention = MiniGPT4QFormerMultiHeadAttention(config, is_cross_attention) self.output = MiniGPT4QFormerSelfOutput(config) self.pruned_heads = set() @@ -732,7 +702,8 @@ def prune_heads(self, heads): heads, self.attention.num_attention_heads, self.attention.attention_head_size, - self.pruned_heads, ) + self.pruned_heads, + ) # Prune linear layers self.attention.query = prune_linear_layer(self.attention.query, index) @@ -741,21 +712,20 @@ def prune_heads(self, heads): self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len( - heads) - self.attention.all_head_size = (self.attention.attention_head_size * - self.attention.num_attention_heads) + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - head_mask: Optional[paddle.Tensor]=None, - encoder_hidden_states: Optional[paddle.Tensor]=None, - encoder_attention_mask: Optional[paddle.Tensor]=None, - past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: self_outputs = self.attention( hidden_states, attention_mask, @@ -763,10 +733,10 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + 
self_outputs[1:] # add attentions if we output them return outputs @@ -789,12 +759,10 @@ class MiniGPT4QFormerOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -811,8 +779,7 @@ def __init__(self, config, layer_idx): self.layer_idx = layer_idx if layer_idx % config.cross_attention_frequency == 0: - self.crossattention = MiniGPT4QFormerAttention( - config, is_cross_attention=True) + self.crossattention = MiniGPT4QFormerAttention(config, is_cross_attention=True) self.has_cross_attention = True else: self.has_cross_attention = False @@ -821,24 +788,25 @@ def __init__(self, config, layer_idx): self.output_query = MiniGPT4QFormerOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -849,16 +817,15 @@ def forward( if self.has_cross_attention: if encoder_hidden_states is None: - raise ValueError( - "encoder_hidden_states must be given for cross-attention layers" - ) + raise ValueError("encoder_hidden_states must be given for cross-attention layers") cross_attention_outputs = self.crossattention( query_attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) query_attention_output = cross_attention_outputs[0] # add cross attentions if we output attention weights outputs = outputs + cross_attention_outputs[1:-1] @@ -867,25 +834,27 @@ def forward( self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, ) + query_attention_output, + ) if attention_output.shape[1] > query_length: layer_output_text = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], ) - layer_output = paddle.concat( - [layer_output, layer_output_text], axis=1) + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) else: layer_output = apply_chunking_to_forward( 
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, ) - outputs = (layer_output, ) + outputs + attention_output, + ) + outputs = (layer_output,) + outputs - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs @@ -904,25 +873,25 @@ class MiniGPT4QFormerEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList([ - MiniGPT4QFormerLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) + self.layer = nn.LayerList( + [MiniGPT4QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions else None @@ -932,14 +901,12 @@ def forward( for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None - if getattr(self.config, "gradient_checkpointing", - False) and self.training: + if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
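# A note on the Q-Former layer stack above (illustration, not part of the diff):
# a layer only instantiates a cross-attention block when
# `layer_idx % cross_attention_frequency == 0`, and the first `query_length`
# positions (the learned query tokens) are routed through
# `intermediate_query`/`output_query`, while any trailing text tokens take the
# ordinary feed-forward path before being concatenated back together.
# With the defaults in this diff (num_hidden_layers=12, cross_attention_frequency=2)
# the layers that attend to the image features are:
#     [i for i in range(12) if i % 2 == 0]   # -> [0, 2, 4, 6, 8, 10]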
@@ -948,8 +915,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions, query_length) + return module(*inputs, past_key_value, output_attentions, query_length) return custom_forward @@ -959,7 +925,8 @@ def custom_forward(*inputs): attention_mask, layer_head_mask, encoder_hidden_states, - encoder_attention_mask, ) + encoder_attention_mask, + ) else: layer_outputs = layer_module( hidden_states, @@ -969,35 +936,39 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, ) + query_length, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) if layer_module.has_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) + cross_attentions=all_cross_attentions, + ) class MiniGPT4QFormerModel(MiniGPT4PretrainedModel): @@ -1009,8 +980,7 @@ def __init__(self, config: MiniGPT4QFormerConfig): super().__init__(config) self.config = config - self.layernorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.encoder = MiniGPT4QFormerEncoder(config) @@ -1030,10 +1000,11 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - has_query: bool=False, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -1054,21 +1025,21 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
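        # For example, an attention_mask of [1, 1, 0] becomes the additive mask
        # [0.0, 0.0, -10000.0] after the cast and the (1.0 - mask) * -10000.0 step
        # below, so the padded position contributes effectively zero probability
        # after the softmax.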
- extended_attention_mask = extended_attention_mask.cast( - dtype=self.layernorm.weight.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.cast(dtype=self.layernorm.weight.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - def invert_attention_mask( - self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: """ Invert an attention mask (e.g., switches 0. and 1.). Args: @@ -1077,26 +1048,25 @@ def invert_attention_mask( `paddle.Tensor`: The inverted attention mask. """ if encoder_attention_mask.ndim == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, - None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.ndim == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, - None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 encoder_extended_attention_mask = encoder_extended_attention_mask.cast( - dtype=self.layernorm.weight.dtype) # fp16 compatibility - encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * -1e4 + dtype=self.layernorm.weight.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 return encoder_extended_attention_mask def get_head_mask( - self, - head_mask: Optional[paddle.Tensor], - num_hidden_layers: int, - is_attention_chunked: bool=False, ) -> paddle.Tensor: + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, + ) -> paddle.Tensor: """ Prepare the head mask if needed. Args: @@ -1111,8 +1081,7 @@ def get_head_mask( `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, - num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -1123,30 +1092,27 @@ def get_head_mask( def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.ndim == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) elif head_mask.ndim == 2: - head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) - ) # We can specify head_mask for each layer + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.cast( - dtype=self.config. 
- dtype) # switch to float if need + fp16 compatibility + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - query_embeds, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1166,62 +1132,52 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_key_values_length past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None else 0) + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) query_length = query_embeds.shape[1] if query_embeds is not None else 0 - embedding_output = self.layernorm( - query_embeds.cast(self.layernorm.weight.dtype)) + embedding_output = self.layernorm(query_embeds.cast(self.layernorm.weight.dtype)) embedding_output = self.dropout(embedding_output) input_shape = embedding_output.shape[:-1] batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0].shape + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape else: ( encoder_batch_size, encoder_sequence_length, - _, ) = encoder_hidden_states.shape + _, + ) = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: - encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) - for mask in encoder_attention_mask - ] + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1243,7 +1199,8 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, ) + query_length=query_length, + ) sequence_output = encoder_outputs[0] pooled_output = sequence_output[:, 0, :] @@ -1256,7 +1213,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class MiniGPT4Model(MiniGPT4PretrainedModel): @@ -1268,27 +1226,24 @@ def __init__(self, config: MiniGPT4Config): self.vision_model = MiniGPT4VisionModel(config.vision_config) - self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ])) + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) self.qformer = MiniGPT4QFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) self.language_model = LlamaForCausalLM(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def get_text_features( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): @@ 
-1306,30 +1261,30 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) >>> text_features = model.get_text_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.language_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return text_outputs def get_image_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1349,32 +1304,30 @@ def get_image_features( >>> inputs = processor.process_images(images=image, return_tensors="pd") >>> image_outputs = model.get_image_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return vision_outputs def get_qformer_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1394,56 +1347,51 @@ def get_qformer_features( >>> inputs = processor.process_images(images=image, return_tensors="pd") >>> qformer_outputs = 
model.get_qformer_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) image_embeds = vision_outputs[0] - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) + return_dict=True, + ) return query_outputs def forward( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - labels: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -1461,69 +1409,61 @@ def forward( >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") >>> outputs = model(**inputs) ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through 
the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) logits = outputs.logits if return_dict else outputs[0] loss = None # we compute the loss here since we need to take into account the sequence length of the query embeds if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] @@ -1533,18 +1473,20 @@ def forward( loss = loss_fct( shift_logits.reshape([-1, self.config.text_config.vocab_size]), - shift_labels.reshape([-1]), ) + shift_labels.reshape([-1]), + ) if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) - return ((loss, 
) + output) if loss is not None else output + return ((loss,) + output) if loss is not None else output return MiniGPT4ForConditionalGenerationModelOutput( loss=loss, logits=logits, vision_outputs=vision_outputs, qformer_outputs=query_outputs, - language_model_outputs=outputs, ) + language_model_outputs=outputs, + ) class MiniGPT4ForConditionalGeneration(MiniGPT4PretrainedModel): @@ -1556,30 +1498,26 @@ def __init__(self, config: MiniGPT4Config): self.config = config self.vision_model = MiniGPT4VisionModel(config.vision_config) - self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ])) + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) self.qformer = MiniGPT4QFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) self.language_model = LlamaForCausalLM(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - labels: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: r""" Examples: ```python @@ -1596,70 +1534,62 @@ def forward( >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") >>> outputs = model(**inputs) ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds 
= paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) logits = outputs.logits if return_dict else outputs[0] loss = None # we compute the loss here since we need to take into account the sequence length of the query embeds if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] @@ -1669,28 +1599,31 @@ def forward( loss = loss_fct( shift_logits.reshape([-1, self.config.text_config.vocab_size]), - shift_labels.reshape([-1]), ) + shift_labels.reshape([-1]), + ) if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) - return ((loss, ) + output) if loss is not None else output + return ((loss,) + output) if loss is not None else output return MiniGPT4ForConditionalGenerationModelOutput( loss=loss, logits=logits, vision_outputs=vision_outputs, qformer_outputs=query_outputs, - language_model_outputs=outputs, ) + language_model_outputs=outputs, + ) @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
Args: @@ -1725,64 +1658,57 @@ def generate( """ # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, - **generate_kwargs, ) + **generate_kwargs, + ) return outputs @paddle.no_grad() def encode_images( - self, - pixel_values: paddle.Tensor, # processed image + self, + pixel_values: paddle.Tensor, # processed image ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
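Taken together, `encode_images` here and `generate_with_image_features` further down in this diff let the expensive vision path (ViT, QFormer, language projection) run once per image while several prompts are decoded against the cached features. A hedged usage sketch; the checkpoint path, token ids and image size below are placeholders rather than values taken from this patch:

```python
import paddle

# Placeholders: checkpoint path, token ids and image size are illustrative only.
model = MiniGPT4ForConditionalGeneration.from_pretrained("path/to/minigpt4")
model.eval()

pixel_values = paddle.randn([1, 3, 224, 224])  # a preprocessed image
first_ids = paddle.to_tensor([[1, 2, 3]])       # prompt tokens before the image slot
second_ids = paddle.to_tensor([[4, 5, 6, 7]])   # prompt tokens after the image slot

with paddle.no_grad():
    # Vision encoder + QFormer + language projection run only once per image.
    image_features, image_mask = model.encode_images(pixel_values)
    output_ids = model.generate_with_image_features(
        image_features,
        first_input_ids=first_ids,
        second_input_ids=second_ids,
        image_attention_mask=image_mask,
        max_length=64,
    )
```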
@@ -1807,44 +1733,40 @@ def encode_images( """ # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") return language_model_inputs, language_model_attention_mask @paddle.no_grad() def generate_with_image_features( - self, - image_features: paddle.Tensor, - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - image_attention_mask: Optional[paddle.Tensor]=None, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + image_features: paddle.Tensor, + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + image_attention_mask: Optional[paddle.Tensor] = None, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
Args: @@ -1884,29 +1806,21 @@ def generate_with_image_features( first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) image_features = paddle.cast(image_features, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, image_features, second_embeds], axis=1) + inputs_embeds = paddle.concat([first_embeds, image_features, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") if image_attention_mask is None: - image_attention_mask = paddle.ones( - image_features.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_features.shape[:-1], dtype="int64") - attention_mask = paddle.concat( - [ - first_attention_mask, image_attention_mask, - second_attention_mask - ], - axis=1) + attention_mask = paddle.concat([first_attention_mask, image_attention_mask, second_attention_mask], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, - **generate_kwargs, ) + **generate_kwargs, + ) return outputs diff --git a/paddlemix/models/sam/common.py b/paddlemix/models/sam/common.py index f0ba1b97e6eab..e8fd2dd038e13 100644 --- a/paddlemix/models/sam/common.py +++ b/paddlemix/models/sam/common.py @@ -19,10 +19,7 @@ class MLPBlock(nn.Layer): - def __init__(self, - embedding_dim: int, - mlp_dim: int, - act: Type[nn.Layer]=nn.GELU) -> None: + def __init__(self, embedding_dim: int, mlp_dim: int, act: Type[nn.Layer] = nn.GELU) -> None: super().__init__() self.lin1 = nn.Linear(embedding_dim, mlp_dim) self.lin2 = nn.Linear(mlp_dim, embedding_dim) @@ -33,16 +30,18 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor: class LayerNorm2d(nn.Layer): - def __init__(self, num_channels: int, eps: float=1e-06) -> None: + def __init__(self, num_channels: int, eps: float = 1e-06) -> None: super().__init__() self.weight = paddle.create_parameter( shape=[num_channels], dtype="float32", - default_initializer=nn.initializer.Constant(value=1.0), ) + default_initializer=nn.initializer.Constant(value=1.0), + ) self.bias = paddle.create_parameter( shape=[num_channels], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) self.eps = eps def forward(self, x: paddle.Tensor) -> paddle.Tensor: diff --git a/paddlemix/models/sam/configuration.py b/paddlemix/models/sam/configuration.py index 9c516c863b5b0..26420a4241f79 100644 --- a/paddlemix/models/sam/configuration.py +++ b/paddlemix/models/sam/configuration.py @@ -17,6 +17,8 @@ from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlemix.utils.log import logger + __all__ = ["SamConfig"] @@ -25,16 +27,17 @@ class SamConfig(PretrainedConfig): model_type = "Sam" def __init__( - self, - modelname="Sam", - prompt_embed_dim=256, - image_size=1024, - vit_patch_size=16, - encoder_embed_dim=768, - encoder_depth=12, - encoder_num_heads=12, - encoder_global_attn_indexes=[2, 5, 8, 11], - input_type=None, ): + self, + modelname="Sam", + prompt_embed_dim=256, + image_size=1024, + vit_patch_size=16, + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + 
encoder_global_attn_indexes=[2, 5, 8, 11], + input_type=None, + ): super().__init__() self.modelname = modelname self.prompt_embed_dim = prompt_embed_dim @@ -45,18 +48,14 @@ def __init__( self.encoder_num_heads = encoder_num_heads self.encoder_global_attn_indexes = encoder_global_attn_indexes self.input_type = input_type - self.pixel_mean = ([123.675, 116.28, 103.53], ) + self.pixel_mean = ([123.675, 116.28, 103.53],) self.pixel_std = [58.395, 57.12, 57.375] @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." diff --git a/paddlemix/models/sam/image_encoder.py b/paddlemix/models/sam/image_encoder.py index b4ba60fe04aef..a6ba55f1167cb 100644 --- a/paddlemix/models/sam/image_encoder.py +++ b/paddlemix/models/sam/image_encoder.py @@ -24,23 +24,24 @@ class ImageEncoderViT(nn.Layer): def __init__( - self, - img_size: int=1024, - patch_size: int=16, - in_chans: int=3, - embed_dim: int=768, - depth: int=12, - num_heads: int=12, - mlp_ratio: float=4.0, - out_chans: int=256, - qkv_bias: bool=True, - norm_layer: Type[nn.Layer]=nn.LayerNorm, - act_layer: Type[nn.Layer]=nn.GELU, - use_abs_pos: bool=True, - use_rel_pos: bool=False, - rel_pos_zero_init: bool=True, - window_size: int=0, - global_attn_indexes: Tuple[int, ...]=(), ) -> None: + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Layer] = nn.LayerNorm, + act_layer: Type[nn.Layer] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + ) -> None: """ Args: img_size (int): Input image size. @@ -66,17 +67,17 @@ def __init__( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, - embed_dim=embed_dim, ) + embed_dim=embed_dim, + ) self.pos_embed = None if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
self.pos_embed = paddle.create_parameter( - shape=[ - 1, img_size // patch_size, img_size // patch_size, embed_dim - ], + shape=[1, img_size // patch_size, img_size // patch_size, embed_dim], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) self.blocks = nn.LayerList() for i in range(depth): @@ -90,7 +91,8 @@ def __init__( use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size if i not in global_attn_indexes else 0, - input_size=(img_size // patch_size, img_size // patch_size), ) + input_size=(img_size // patch_size, img_size // patch_size), + ) self.blocks.append(block) self.neck = nn.Sequential( @@ -98,15 +100,18 @@ def __init__( embed_dim, out_chans, kernel_size=1, - bias_attr=False, ), + bias_attr=False, + ), LayerNorm2d(out_chans), nn.Conv2D( out_chans, out_chans, kernel_size=3, padding=1, - bias_attr=False, ), - LayerNorm2d(out_chans), ) + bias_attr=False, + ), + LayerNorm2d(out_chans), + ) def forward(self, x: paddle.Tensor) -> paddle.Tensor: x = self.patch_embed(x) @@ -125,17 +130,18 @@ class Block(nn.Layer): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( - self, - dim: int, - num_heads: int, - mlp_ratio: float=4.0, - qkv_bias: bool=True, - norm_layer=nn.LayerNorm, - act_layer=nn.GELU, - use_rel_pos: bool=False, - rel_pos_zero_init: bool=True, - window_size: int=0, - input_size: Optional[Tuple[int, int]]=None, ) -> None: + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + act_layer=nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: """ Args: dim (int): Number of input channels. @@ -159,12 +165,11 @@ def __init__( qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, - input_size=input_size - if window_size == 0 else (window_size, window_size), ) + input_size=input_size if window_size == 0 else (window_size, window_size), + ) self.norm2 = norm_layer(dim) - self.mlp = MLPBlock( - embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) self.window_size = window_size @@ -191,13 +196,14 @@ class Attention(nn.Layer): """Multi-head Attention block with relative position embeddings.""" def __init__( - self, - dim: int, - num_heads: int=8, - qkv_bias: bool=True, - use_rel_pos: bool=False, - rel_pos_zero_init: bool=True, - input_size: Optional[Tuple[int, int]]=None, ) -> None: + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: """ Args: dim (int): Number of input channels. @@ -213,51 +219,45 @@ def __init__( head_dim = dim // num_heads self.scale = head_dim**-0.5 - self.qkv = nn.Linear( - dim, dim * 3, bias_attr=qkv_bias if not qkv_bias else None) + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias if not qkv_bias else None) self.proj = nn.Linear(dim, dim) self.use_rel_pos = use_rel_pos if self.use_rel_pos: - assert ( - input_size is not None - ), "Input size must be provided if using relative positional encoding." + assert input_size is not None, "Input size must be provided if using relative positional encoding." 
# initialize relative positional embeddings self.rel_pos_h = paddle.create_parameter( shape=[2 * input_size[0] - 1, head_dim], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) self.rel_pos_w = paddle.create_parameter( shape=[2 * input_size[1] - 1, head_dim], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) def forward(self, x: paddle.Tensor) -> paddle.Tensor: B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) - qkv = (self.qkv(x).reshape(shape=[B, H * W, 3, self.num_heads, -1]) - .transpose([2, 0, 3, 1, 4])) + qkv = self.qkv(x).reshape(shape=[B, H * W, 3, self.num_heads, -1]).transpose([2, 0, 3, 1, 4]) # q, k, v with shape (B * nHead, H * W, C) - q, k, v = qkv.reshape(shape=[3, B * self.num_heads, H * W, -1]).unbind( - axis=0) + q, k, v = qkv.reshape(shape=[3, B * self.num_heads, H * W, -1]).unbind(axis=0) - attn = (q * self.scale) @k.transpose([0, 2, 1]) + attn = (q * self.scale) @ k.transpose([0, 2, 1]) if self.use_rel_pos: - attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, - self.rel_pos_w, (H, W), (H, W)) + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) attn = F.softmax(attn, axis=-1) - x = ((attn @v).reshape([B, self.num_heads, H, W, -1]) - .transpose([0, 2, 3, 1, 4]).reshape([B, H, W, -1])) + x = (attn @ v).reshape([B, self.num_heads, H, W, -1]).transpose([0, 2, 3, 1, 4]).reshape([B, H, W, -1]) x = self.proj(x) return x -def window_partition(x: paddle.Tensor, - window_size: int) -> Tuple[paddle.Tensor, Tuple[int, int]]: +def window_partition(x: paddle.Tensor, window_size: int) -> Tuple[paddle.Tensor, Tuple[int, int]]: """ Partition into non-overlapping windows with padding if needed. Args: @@ -273,22 +273,20 @@ def window_partition(x: paddle.Tensor, pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size if pad_h > 0 or pad_w > 0: - x = paddle.nn.functional.pad( - x=x, pad=(0, 0, 0, pad_w, 0, pad_h, 0, 0)) # 每个维度分两位数进行pad + x = paddle.nn.functional.pad(x=x, pad=(0, 0, 0, pad_w, 0, pad_h, 0, 0)) # 每个维度分两位数进行pad Hp, Wp = H + pad_h, W + pad_w - x = x.reshape( - [B, Hp // window_size, window_size, Wp // window_size, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) + x = x.reshape([B, Hp // window_size, window_size, Wp // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) return windows, (Hp, Wp) def window_unpartition( - windows: paddle.Tensor, - window_size: int, - pad_hw: Tuple[int, int], - hw: Tuple[int, int], ) -> paddle.Tensor: + windows: paddle.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], +) -> paddle.Tensor: """ Window unpartition into original sequences and removing padding. 
Args: @@ -303,9 +301,7 @@ def window_unpartition( Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) - x = windows.reshape([ - B, Hp // window_size, Wp // window_size, window_size, window_size, -1 - ]) + x = windows.reshape([B, Hp // window_size, Wp // window_size, window_size, window_size, -1]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, -1]) if Hp > H or Wp > W: @@ -313,8 +309,7 @@ def window_unpartition( return x -def get_rel_pos(q_size: int, k_size: int, - rel_pos: paddle.Tensor) -> paddle.Tensor: +def get_rel_pos(q_size: int, k_size: int, rel_pos: paddle.Tensor) -> paddle.Tensor: """ Get relative positional embeddings according to the relative positions of query and key sizes. @@ -333,31 +328,29 @@ def get_rel_pos(q_size: int, k_size: int, rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).transpose([0, 2, 1]), size=max_rel_dist, - mode="linear", ) - rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]).transpose( - [1, 0]) + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]).transpose([1, 0]) else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. q_coords = paddle.arange(end=q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = paddle.arange(end=k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / - k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) h, w = relative_coords.shape - return paddle.index_select(rel_pos_resized, - relative_coords.cast("int64").flatten()).reshape( - (h, w, -1)) + return paddle.index_select(rel_pos_resized, relative_coords.cast("int64").flatten()).reshape((h, w, -1)) def add_decomposed_rel_pos( - attn: paddle.Tensor, - q: paddle.Tensor, - rel_pos_h: paddle.Tensor, - rel_pos_w: paddle.Tensor, - q_size: Tuple[int, int], - k_size: Tuple[int, int], ) -> paddle.Tensor: + attn: paddle.Tensor, + q: paddle.Tensor, + rel_pos_h: paddle.Tensor, + rel_pos_w: paddle.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> paddle.Tensor: """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 @@ -382,8 +375,9 @@ def add_decomposed_rel_pos( rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw) - attn = (attn.reshape([B, q_h, q_w, k_h, k_w]) + rel_h[:, :, :, :, None] + - rel_w[:, :, :, None, :]).reshape([B, q_h * q_w, k_h * k_w]) + attn = (attn.reshape([B, q_h, q_w, k_h, k_w]) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape( + [B, q_h * q_w, k_h * k_w] + ) return attn @@ -394,12 +388,13 @@ class PatchEmbed(nn.Layer): """ def __init__( - self, - kernel_size: Tuple[int, int]=(16, 16), - stride: Tuple[int, int]=(16, 16), - padding: Tuple[int, int]=(0, 0), - in_chans: int=3, - embed_dim: int=768, ) -> None: + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: """ Args: kernel_size (Tuple): kernel size of the projection layer. 
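Since the reflow turns `window_partition` and `window_unpartition` above into single-line expressions, a small round-trip check may help. It assumes the two helpers from `paddlemix/models/sam/image_encoder.py` are importable and only looks at shapes:

```python
import paddle

x = paddle.randn([2, 10, 10, 8])                      # [B, H, W, C]; 10 is not divisible by 7
windows, pad_hw = window_partition(x, window_size=7)   # pads to 14 x 14 internally
print(windows.shape)                                   # [8, 7, 7, 8]: 2 batches x 4 windows each
restored = window_unpartition(windows, 7, pad_hw, hw=(10, 10))
print(restored.shape)                                  # [2, 10, 10, 8], padding removed
```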
@@ -410,12 +405,7 @@ def __init__( """ super().__init__() - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=kernel_size, - stride=stride, - padding=padding) + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) def forward(self, x: paddle.Tensor) -> paddle.Tensor: x = self.proj(x) @@ -429,48 +419,44 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor: import paddle import torch from padiff import auto_diff - from segment_anything.modeling import \ - ImageEncoderViT as ImageEncoderViT_torch + from segment_anything.modeling import ImageEncoderViT as ImageEncoderViT_torch image_encoder_t = ImageEncoderViT_torch( depth=12, embed_dim=768, img_size=1024, mlp_ratio=4, - norm_layer=partial( - torch.nn.LayerNorm, eps=1e-6), + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), num_heads=12, patch_size=16, qkv_bias=True, use_rel_pos=True, global_attn_indexes=[2, 5, 8, 11], window_size=14, - out_chans=256, ) + out_chans=256, + ) image_encoder = ImageEncoderViT( depth=12, embed_dim=768, img_size=1024, mlp_ratio=4, - norm_layer=partial( - paddle.nn.LayerNorm, epsilon=1e-6), + norm_layer=partial(paddle.nn.LayerNorm, epsilon=1e-6), num_heads=12, patch_size=16, qkv_bias=True, use_rel_pos=True, global_attn_indexes=[2, 5, 8, 11], window_size=14, - out_chans=256, ) + out_chans=256, + ) # Generate random numbers of shape (4, 3, 128, 128) random_numbers = np.random.rand(1, 3, 1024, 1024).astype("float32") inp = ( - { - "x": paddle.to_tensor(random_numbers) - }, - { - "x": torch.as_tensor(random_numbers) - }, ) + {"x": paddle.to_tensor(random_numbers)}, + {"x": torch.as_tensor(random_numbers)}, + ) auto_diff( image_encoder, @@ -482,4 +468,5 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor: "rtol": 0, "compare_mode": "mean", "single_step": False, - }, ) + }, + ) diff --git a/paddlemix/models/sam/mask_decoder.py b/paddlemix/models/sam/mask_decoder.py index e3aa90acd5671..babf266b92ef6 100644 --- a/paddlemix/models/sam/mask_decoder.py +++ b/paddlemix/models/sam/mask_decoder.py @@ -20,14 +20,16 @@ class MaskDecoder(paddle.nn.Layer): - def __init__(self, - *, - transformer_dim: int, - transformer: paddle.nn.Layer, - num_multimask_outputs: int=3, - activation: Type[paddle.nn.Layer]=paddle.nn.GELU, - iou_head_depth: int=3, - iou_head_hidden_dim: int=256) -> None: + def __init__( + self, + *, + transformer_dim: int, + transformer: paddle.nn.Layer, + num_multimask_outputs: int = 3, + activation: Type[paddle.nn.Layer] = paddle.nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256 + ) -> None: """ Predicts masks given an image and prompt embeddings, using a tranformer architecture. 
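Note the bare `*` in the rewritten `MaskDecoder.__init__` signature: every argument is keyword-only, which is easy to miss in the reflowed diff. A construction sketch mirroring the wiring that `SamModel` uses later in this patch, assuming `MaskDecoder` and `TwoWayTransformer` are imported from the SAM modules in this package:

```python
prompt_embed_dim = 256

mask_decoder = MaskDecoder(
    transformer_dim=prompt_embed_dim,
    transformer=TwoWayTransformer(  # the two-way transformer used by SamModel below
        depth=2,
        embedding_dim=prompt_embed_dim,
        mlp_dim=2048,
        num_heads=8,
    ),
    num_multimask_outputs=3,
    iou_head_depth=3,
    iou_head_hidden_dim=256,
)
```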
@@ -50,36 +52,39 @@ def __init__(self, self.num_multimask_outputs = num_multimask_outputs self.iou_token = paddle.nn.Embedding(1, transformer_dim) self.num_mask_tokens = num_multimask_outputs + 1 - self.mask_tokens = paddle.nn.Embedding(self.num_mask_tokens, - transformer_dim) + self.mask_tokens = paddle.nn.Embedding(self.num_mask_tokens, transformer_dim) self.output_upscaling = paddle.nn.Sequential( paddle.nn.Conv2DTranspose( in_channels=transformer_dim, out_channels=transformer_dim // 4, kernel_size=2, - stride=2, ), + stride=2, + ), LayerNorm2d(transformer_dim // 4), activation(), paddle.nn.Conv2DTranspose( in_channels=transformer_dim // 4, out_channels=transformer_dim // 8, kernel_size=2, - stride=2, ), - activation(), ) - self.output_hypernetworks_mlps = paddle.nn.LayerList(sublayers=[ - MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) - for i in range(self.num_mask_tokens) - ]) - self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, - self.num_mask_tokens, iou_head_depth) + stride=2, + ), + activation(), + ) + self.output_hypernetworks_mlps = paddle.nn.LayerList( + sublayers=[ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for i in range(self.num_mask_tokens) + ] + ) + self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth) def forward( - self, - image_embeddings: paddle.Tensor, - image_pe: paddle.Tensor, - sparse_prompt_embeddings: paddle.Tensor, - dense_prompt_embeddings: paddle.Tensor, - multimask_output: bool, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + self, + image_embeddings: paddle.Tensor, + image_pe: paddle.Tensor, + sparse_prompt_embeddings: paddle.Tensor, + dense_prompt_embeddings: paddle.Tensor, + multimask_output: bool, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Predict masks given image and prompt embeddings. @@ -99,7 +104,8 @@ def forward( image_embeddings=image_embeddings, image_pe=image_pe, sparse_prompt_embeddings=sparse_prompt_embeddings, - dense_prompt_embeddings=dense_prompt_embeddings, ) + dense_prompt_embeddings=dense_prompt_embeddings, + ) if multimask_output: mask_slice = slice(1, None) @@ -111,28 +117,24 @@ def forward( return masks, iou_pred def predict_masks( - self, - image_embeddings: paddle.Tensor, - image_pe: paddle.Tensor, - sparse_prompt_embeddings: paddle.Tensor, - dense_prompt_embeddings: paddle.Tensor, ) -> Tuple[paddle.Tensor, - paddle.Tensor]: + self, + image_embeddings: paddle.Tensor, + image_pe: paddle.Tensor, + sparse_prompt_embeddings: paddle.Tensor, + dense_prompt_embeddings: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Predicts masks. 
See 'forward' for more details.""" - output_tokens = paddle.concat( - x=[self.iou_token.weight, self.mask_tokens.weight], axis=0) - output_tokens = output_tokens.unsqueeze(axis=0).expand( - shape=[sparse_prompt_embeddings.shape[0], -1, -1]) - tokens = paddle.concat( - x=(output_tokens, sparse_prompt_embeddings), axis=1) - src = paddle.repeat_interleave( - image_embeddings, tokens.shape[0], axis=0) + output_tokens = paddle.concat(x=[self.iou_token.weight, self.mask_tokens.weight], axis=0) + output_tokens = output_tokens.unsqueeze(axis=0).expand(shape=[sparse_prompt_embeddings.shape[0], -1, -1]) + tokens = paddle.concat(x=(output_tokens, sparse_prompt_embeddings), axis=1) + src = paddle.repeat_interleave(image_embeddings, tokens.shape[0], axis=0) src = src + dense_prompt_embeddings pos_src = paddle.repeat_interleave(image_pe, tokens.shape[0], axis=0) b, c, h, w = src.shape hs, src = self.transformer(src, pos_src, tokens) iou_token_out = hs[:, (0), :] - mask_tokens_out = hs[:, 1:1 + self.num_mask_tokens, :] + mask_tokens_out = hs[:, 1 : 1 + self.num_mask_tokens, :] x = src perm_0 = list(range(x.ndim)) perm_0[1] = 2 @@ -142,37 +144,37 @@ def predict_masks( upscaled_embedding = self.output_upscaling(src) hyper_in_list: List[paddle.Tensor] = [] for i in range(self.num_mask_tokens): - hyper_in_list.append(self.output_hypernetworks_mlps[i]( - mask_tokens_out[:, (i), :])) + hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, (i), :])) hyper_in = paddle.stack(x=hyper_in_list, axis=1) b, c, h, w = upscaled_embedding.shape - masks = (hyper_in @upscaled_embedding.reshape([b, c, h * w])).reshape( - [b, -1, h, w]) + masks = (hyper_in @ upscaled_embedding.reshape([b, c, h * w])).reshape([b, -1, h, w]) iou_pred = self.iou_prediction_head(iou_token_out) return masks, iou_pred class MLP(paddle.nn.Layer): def __init__( - self, - input_dim: int, - hidden_dim: int, - output_dim: int, - num_layers: int, - sigmoid_output: bool=False, ) -> None: + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + sigmoid_output: bool = False, + ) -> None: super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = paddle.nn.LayerList(sublayers=(paddle.nn.Linear( - in_features=n, - out_features=k) for n, k in zip([input_dim] + h, h + [output_dim]))) + self.layers = paddle.nn.LayerList( + sublayers=( + paddle.nn.Linear(in_features=n, out_features=k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + ) self.sigmoid_output = sigmoid_output def forward(self, x): for i, layer in enumerate(self.layers): - x = (paddle.nn.functional.relu(x=layer(x)) - if i < self.num_layers - 1 else layer(x)) + x = paddle.nn.functional.relu(x=layer(x)) if i < self.num_layers - 1 else layer(x) if self.sigmoid_output: x = paddle.nn.functional.sigmoid(x=x) return x diff --git a/paddlemix/models/sam/modeling.py b/paddlemix/models/sam/modeling.py index 6d82a9626d4fb..2d64e17c302b0 100644 --- a/paddlemix/models/sam/modeling.py +++ b/paddlemix/models/sam/modeling.py @@ -13,14 +13,11 @@ # limitations under the License. 
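The hypernetwork step in `predict_masks` above is easier to see with the shapes spelled out: each mask token passes through its own small MLP to produce a per-mask weight vector, and a single batched matmul against the upscaled image embedding yields the low-resolution masks. A shape-only sketch with random tensors, using `transformer_dim = 256` so the upscaled embedding has `256 // 8 = 32` channels:

```python
import paddle

b, num_mask_tokens = 1, 4              # 3 multimask outputs + 1 single-mask token
c, h, w = 32, 256, 256                 # channels and spatial size after output_upscaling

hyper_in = paddle.randn([b, num_mask_tokens, c])  # stacked outputs of the per-token MLPs
upscaled_embedding = paddle.randn([b, c, h, w])

masks = (hyper_in @ upscaled_embedding.reshape([b, c, h * w])).reshape([b, -1, h, w])
print(masks.shape)                     # [1, 4, 256, 256]
```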
from functools import partial -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import numpy as np import paddle -from paddle import nn -from paddle.nn import functional as F -from paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from .configuration import SamConfig from .image_encoder import ImageEncoderViT @@ -65,39 +62,44 @@ def __init__(self, config: SamConfig): embed_dim=config.encoder_embed_dim, img_size=image_size, mlp_ratio=4, - norm_layer=partial( - paddle.nn.LayerNorm, epsilon=1e-6), + norm_layer=partial(paddle.nn.LayerNorm, epsilon=1e-6), num_heads=config.encoder_num_heads, patch_size=vit_patch_size, qkv_bias=True, use_rel_pos=True, global_attn_indexes=config.encoder_global_attn_indexes, window_size=14, - out_chans=prompt_embed_dim, ) + out_chans=prompt_embed_dim, + ) self.prompt_encoder = PromptEncoder( embed_dim=prompt_embed_dim, image_embedding_size=(image_embedding_size, image_embedding_size), input_image_size=(image_size, image_size), - mask_in_chans=16, ) + mask_in_chans=16, + ) self.mask_decoder = MaskDecoder( num_multimask_outputs=3, transformer=TwoWayTransformer( depth=2, embedding_dim=prompt_embed_dim, mlp_dim=2048, - num_heads=8, ), + num_heads=8, + ), transformer_dim=prompt_embed_dim, iou_head_depth=3, - iou_head_hidden_dim=256, ) + iou_head_hidden_dim=256, + ) self.eval() self.register_buffer( "pixel_mean", paddle.to_tensor(config.pixel_mean).reshape([-1, 1, 1]), - persistable=False, ) + persistable=False, + ) self.register_buffer( "pixel_std", paddle.to_tensor(config.pixel_std).reshape([-1, 1, 1]), - persistable=False, ) + persistable=False, + ) @property def device(self) -> Any: @@ -111,9 +113,10 @@ def reset_img(self): self.set_image = False def after_forward(self): - masks = masks[0].detach().cpu().numpy() - iou_predictions = iou_predictions[0].detach().cpu().numpy() - low_res_masks = low_res_masks[0].detach().cpu().numpy() + # masks = masks[0].detach().cpu().numpy() + # iou_predictions = iou_predictions[0].detach().cpu().numpy() + # low_res_masks = low_res_masks[0].detach().cpu().numpy() + pass @paddle.no_grad() def prompt_forward_point(self, x=None, coords_paddle=None): @@ -132,7 +135,8 @@ def prompt_forward_point(self, x=None, coords_paddle=None): sparse_embeddings, dense_embeddings = self.prompt_encoder( points=points, boxes=None, - masks=None, ) + masks=None, + ) # Predict masks low_res_masks, iou_predictions = self.mask_decoder( @@ -140,7 +144,8 @@ def prompt_forward_point(self, x=None, coords_paddle=None): image_pe=self.prompt_encoder.get_dense_pe(), sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings, - multimask_output=False, ) + multimask_output=False, + ) return low_res_masks @@ -155,7 +160,8 @@ def prompt_forward_box(self, x=None, box_paddle=None): sparse_embeddings, dense_embeddings = self.prompt_encoder( points=None, boxes=box_paddle, - masks=None, ) + masks=None, + ) # Predict masks low_res_masks, iou_predictions = self.mask_decoder( @@ -163,15 +169,19 @@ def prompt_forward_box(self, x=None, box_paddle=None): image_pe=self.prompt_encoder.get_dense_pe(), sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings, - multimask_output=False, ) + multimask_output=False, + ) return low_res_masks # , iou_predictions, low_res_masks @paddle.no_grad() def full_mask_forward(self, img: List[Dict[str, Any]], coords_paddle): labels_paddle = paddle.ones( - 
shape=[coords_paddle.shape[0], ], - dtype="int64", ) + shape=[ + coords_paddle.shape[0], + ], + dtype="int64", + ) labels_paddle = paddle.to_tensor(labels_paddle).cast("int32")[:, None] points = (coords_paddle, labels_paddle) @@ -183,7 +193,8 @@ def full_mask_forward(self, img: List[Dict[str, Any]], coords_paddle): sparse_embeddings, dense_embeddings = self.prompt_encoder( points=points, boxes=None, - masks=None, ) + masks=None, + ) # Predict masks low_res_masks, iou_predictions = self.mask_decoder( @@ -191,7 +202,8 @@ def full_mask_forward(self, img: List[Dict[str, Any]], coords_paddle): image_pe=self.prompt_encoder.get_dense_pe(), sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings, - multimask_output=False, ) + multimask_output=False, + ) return low_res_masks, iou_predictions # (64, 3) # low_res_masks, @@ -205,7 +217,7 @@ def forward(self, img=None, prompt=None): return masks, iou_predictions else: NotImplementedError( - 'input_type need to be in {"points", "boxs", "points_grid"}, but got: {}'. - format(self.input_type)) + 'input_type need to be in ["points", "boxs", "points_grid"], but got: {}'.format(self.input_type) + ) return masks diff --git a/paddlemix/models/sam/prompt_encoder.py b/paddlemix/models/sam/prompt_encoder.py index ff51c6dd571e6..168bad62aa992 100644 --- a/paddlemix/models/sam/prompt_encoder.py +++ b/paddlemix/models/sam/prompt_encoder.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional, Tuple, Type +from typing import Optional, Tuple, Type import numpy as np import paddle @@ -22,12 +22,13 @@ class PromptEncoder(paddle.nn.Layer): def __init__( - self, - embed_dim: int, - image_embedding_size: Tuple[int, int], - input_image_size: Tuple[int, int], - mask_in_chans: int, - activation: Type[paddle.nn.Layer]=paddle.nn.GELU, ) -> None: + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[paddle.nn.Layer] = paddle.nn.GELU, + ) -> None: """ Encodes prompts for input to SAM's mask decoder. 
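The prompt paths above (`prompt_forward_point`, `prompt_forward_box`, `full_mask_forward`) all funnel into this encoder. A hedged sketch of embedding a single foreground point, assuming `PromptEncoder` is importable; the sizes follow the `SamConfig` defaults earlier in this patch (`prompt_embed_dim=256`, `image_size=1024`, `vit_patch_size=16`), and the point coordinates are arbitrary:

```python
import paddle

prompt_encoder = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),   # 1024 // 16
    input_image_size=(1024, 1024),
    mask_in_chans=16,
)

coords = paddle.to_tensor([[[512.0, 512.0]]])  # [batch, num_points, 2], pixel coordinates
labels = paddle.to_tensor([[1]])                # 1 marks a foreground point
sparse, dense = prompt_encoder(points=(coords, labels), boxes=None, masks=None)
print(sparse.shape)  # [1, 2, 256]: the point plus one padding point (since boxes is None)
print(dense.shape)   # [1, 256, 64, 64]: the learned no-mask embedding, broadcast spatially
```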
@@ -48,33 +49,24 @@ def __init__( self.image_embedding_size = image_embedding_size self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) self.num_point_embeddings: int = 4 - point_embeddings = [ - paddle.nn.Embedding(1, embed_dim) - for i in range(self.num_point_embeddings) - ] + point_embeddings = [paddle.nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)] self.point_embeddings = paddle.nn.LayerList(sublayers=point_embeddings) self.not_a_point_embed = paddle.nn.Embedding(1, embed_dim) - self.mask_input_size = 4 * image_embedding_size[ - 0], 4 * image_embedding_size[1] + self.mask_input_size = 4 * image_embedding_size[0], 4 * image_embedding_size[1] self.mask_downscaling = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=1, - out_channels=mask_in_chans // 4, - kernel_size=2, - stride=2), + paddle.nn.Conv2D(in_channels=1, out_channels=mask_in_chans // 4, kernel_size=2, stride=2), LayerNorm2d(mask_in_chans // 4), activation(), paddle.nn.Conv2D( in_channels=mask_in_chans // 4, out_channels=mask_in_chans, kernel_size=2, - stride=2, ), + stride=2, + ), LayerNorm2d(mask_in_chans), activation(), - paddle.nn.Conv2D( - in_channels=mask_in_chans, - out_channels=embed_dim, - kernel_size=1), ) + paddle.nn.Conv2D(in_channels=mask_in_chans, out_channels=embed_dim, kernel_size=1), + ) self.no_mask_embed = paddle.nn.Embedding(1, embed_dim) def get_dense_pe(self) -> paddle.Tensor: @@ -88,10 +80,7 @@ def get_dense_pe(self) -> paddle.Tensor: """ return self.pe_layer(self.image_embedding_size).unsqueeze(axis=0) - def _embed_points(self, - points: paddle.Tensor, - labels: paddle.Tensor, - pad: bool) -> paddle.Tensor: + def _embed_points(self, points: paddle.Tensor, labels: paddle.Tensor, pad: bool) -> paddle.Tensor: """Embeds point prompts.""" points = points + 0.5 points = points.cast("float32") @@ -99,10 +88,8 @@ def _embed_points(self, padding_point = paddle.zeros(shape=(points.shape[0], 1, 2)) padding_label = -paddle.ones(shape=(labels.shape[0], 1)) points = paddle.concat(x=[points, padding_point], axis=1) - labels = paddle.concat( - x=[labels.astype("float32"), padding_label], axis=1) - point_embedding = self.pe_layer.forward_with_coords( - points, self.input_image_size) + labels = paddle.concat(x=[labels.astype("float32"), padding_label], axis=1) + point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size) point_embedding[labels == -1] = 0.0 if point_embedding[labels == -1].shape[0] != 0: @@ -117,8 +104,7 @@ def _embed_boxes(self, boxes: paddle.Tensor) -> paddle.Tensor: """Embeds box prompts.""" boxes = boxes + 0.5 coords = boxes.reshape([-1, 2, 2]) - corner_embedding = self.pe_layer.forward_with_coords( - coords, self.input_image_size) + corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size) corner_embedding[:, (0), :] += self.point_embeddings[2].weight corner_embedding[:, (1), :] += self.point_embeddings[3].weight return corner_embedding @@ -129,10 +115,11 @@ def _embed_masks(self, masks: paddle.Tensor) -> paddle.Tensor: return mask_embedding def _get_batch_size( - self, - points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], - boxes: Optional[paddle.Tensor], - masks: Optional[paddle.Tensor], ) -> int: + self, + points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], + boxes: Optional[paddle.Tensor], + masks: Optional[paddle.Tensor], + ) -> int: """ Gets the batch size of the output given the batch size of the input prompts. 
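Editor's note: a small illustration of the box handling in _embed_boxes above; each (x1, y1, x2, y2) box is split into a pair of corner points before positional encoding. The box coordinates below are made up.

import paddle

boxes = paddle.to_tensor([[100.0, 150.0, 400.0, 500.0]])  # [num_boxes, 4] = (x1, y1, x2, y2)
corners = (boxes + 0.5).reshape([-1, 2, 2])               # [num_boxes, 2, 2]: top-left, bottom-right
print(corners.shape)                                      # [1, 2, 2]
# In _embed_boxes, corner 0 is then offset by point_embeddings[2] and corner 1 by point_embeddings[3].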
""" @@ -149,11 +136,11 @@ def _get_device(self): return self.point_embeddings[0].weight.place def forward( - self, - points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], - boxes: Optional[paddle.Tensor], - masks: Optional[paddle.Tensor], ) -> Tuple[paddle.Tensor, - paddle.Tensor]: + self, + points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], + boxes: Optional[paddle.Tensor], + masks: Optional[paddle.Tensor], + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Embeds different types of prompts, returning both sparse and dense embeddings. @@ -175,24 +162,22 @@ def forward( sparse_embeddings = paddle.empty(shape=(bs, 0, self.embed_dim)) if points is not None: coords, labels = points - point_embeddings = self._embed_points( - coords, labels, pad=boxes is None) - sparse_embeddings = paddle.concat( - x=[sparse_embeddings, point_embeddings], axis=1) + point_embeddings = self._embed_points(coords, labels, pad=boxes is None) + sparse_embeddings = paddle.concat(x=[sparse_embeddings, point_embeddings], axis=1) if boxes is not None: box_embeddings = self._embed_boxes(boxes) - sparse_embeddings = paddle.concat( - x=[sparse_embeddings, box_embeddings], axis=1) + sparse_embeddings = paddle.concat(x=[sparse_embeddings, box_embeddings], axis=1) if masks is not None: dense_embeddings = self._embed_masks(masks) else: - dense_embeddings = self.no_mask_embed.weight.reshape( - [1, -1, 1, 1]).expand(shape=[ + dense_embeddings = self.no_mask_embed.weight.reshape([1, -1, 1, 1]).expand( + shape=[ bs, -1, self.image_embedding_size[0], self.image_embedding_size[1], - ]) + ] + ) return sparse_embeddings, dense_embeddings @@ -201,27 +186,26 @@ class PositionEmbeddingRandom(paddle.nn.Layer): Positional encoding using random spatial frequencies. """ - def __init__(self, num_pos_feats: int=64, - scale: Optional[float]=None) -> None: + def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None: super().__init__() if scale is None or scale <= 0.0: scale = 1.0 self.register_buffer( "positional_encoding_gaussian_matrix", - scale * paddle.randn(shape=(2, num_pos_feats)), ) + scale * paddle.randn(shape=(2, num_pos_feats)), + ) def _pe_encoding(self, coords: paddle.Tensor) -> paddle.Tensor: """Positionally encode points that are normalized to [0,1].""" coords = 2 * coords - 1 - coords = coords @self.positional_encoding_gaussian_matrix + coords = coords @ self.positional_encoding_gaussian_matrix coords = 2 * np.pi * coords - return paddle.concat( - x=[paddle.sin(x=coords), paddle.cos(x=coords)], axis=-1) + return paddle.concat(x=[paddle.sin(x=coords), paddle.cos(x=coords)], axis=-1) def forward(self, size: Tuple[int, int]) -> paddle.Tensor: """Generate positional encoding for a grid of the specified size.""" h, w = size - device: Any = self.positional_encoding_gaussian_matrix.place + # device: Any = self.positional_encoding_gaussian_matrix.place grid = paddle.ones(shape=(h, w), dtype="float32") y_embed = grid.cumsum(axis=0) - 0.5 x_embed = grid.cumsum(axis=1) - 0.5 @@ -230,9 +214,7 @@ def forward(self, size: Tuple[int, int]) -> paddle.Tensor: pe = self._pe_encoding(paddle.stack(x=[x_embed, y_embed], axis=-1)) return pe.transpose(perm=[2, 0, 1]) - def forward_with_coords(self, - coords_input: paddle.Tensor, - image_size: Tuple[int, int]) -> paddle.Tensor: + def forward_with_coords(self, coords_input: paddle.Tensor, image_size: Tuple[int, int]) -> paddle.Tensor: """Positionally encode points that are not normalized to [0,1].""" coords = coords_input.clone() coords[:, :, (0)] = coords[:, :, (0)] / 
image_size[1] diff --git a/paddlemix/models/sam/transformer.py b/paddlemix/models/sam/transformer.py index ef27885ce8a21..a040b99aac6f8 100644 --- a/paddlemix/models/sam/transformer.py +++ b/paddlemix/models/sam/transformer.py @@ -15,7 +15,6 @@ import math from typing import Tuple, Type -import paddle import paddle.nn.functional as F from paddle import Tensor, nn @@ -24,13 +23,14 @@ class TwoWayTransformer(nn.Layer): def __init__( - self, - depth: int, - embedding_dim: int, - num_heads: int, - mlp_dim: int, - activation: Type[nn.Layer]=nn.ReLU, - attention_downsample_rate: int=2, ) -> None: + self, + depth: int, + embedding_dim: int, + num_heads: int, + mlp_dim: int, + activation: Type[nn.Layer] = nn.ReLU, + attention_downsample_rate: int = 2, + ) -> None: """ A transformer decoder that attends to an input image using queries whose positional embedding is supplied. @@ -57,13 +57,13 @@ def __init__( mlp_dim=mlp_dim, activation=activation, attention_downsample_rate=attention_downsample_rate, - skip_first_layer_pe=i == 0, )) - self.final_attn_token_to_image = Attention( - embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + skip_first_layer_pe=i == 0, + ) + ) + self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) self.norm_final_attn = nn.LayerNorm(embedding_dim) - def forward(self, image_embedding, image_pe, - point_embedding) -> Tuple[Tensor, Tensor]: + def forward(self, image_embedding, image_pe, point_embedding) -> Tuple[Tensor, Tensor]: """ Args: image_embedding (paddle.Tensor): image to attend to. Should be shape @@ -83,11 +83,7 @@ def forward(self, image_embedding, image_pe, queries = point_embedding keys = image_embedding for layer in self.layers: - queries, keys = layer( - queries=queries, - keys=keys, - query_pe=point_embedding, - key_pe=image_pe) + queries, keys = layer(queries=queries, keys=keys, query_pe=point_embedding, key_pe=image_pe) q = queries + point_embedding k = keys + image_pe attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys) @@ -98,13 +94,14 @@ def forward(self, image_embedding, image_pe, class TwoWayAttentionBlock(nn.Layer): def __init__( - self, - embedding_dim: int, - num_heads: int, - mlp_dim: int=2048, - activation: Type[nn.Layer]=nn.ReLU, - attention_downsample_rate: int=2, - skip_first_layer_pe: bool=False, ) -> None: + self, + embedding_dim: int, + num_heads: int, + mlp_dim: int = 2048, + activation: Type[nn.Layer] = nn.ReLU, + attention_downsample_rate: int = 2, + skip_first_layer_pe: bool = False, + ) -> None: """ A transformer block with four layers: (1) self-attention of sparse inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp @@ -121,14 +118,12 @@ def __init__( super().__init__() self.self_attn = Attention(embedding_dim, num_heads) self.norm1 = nn.LayerNorm(embedding_dim) - self.cross_attn_token_to_image = Attention( - embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) self.norm2 = nn.LayerNorm(embedding_dim) self.mlp = MLPBlock(embedding_dim, mlp_dim, activation) self.norm3 = nn.LayerNorm(embedding_dim) self.norm4 = nn.LayerNorm(embedding_dim) - self.cross_attn_image_to_token = Attention( - embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) self.skip_first_layer_pe = skip_first_layer_pe def 
forward(self, queries, keys, query_pe, key_pe) -> Tuple[Tensor, Tensor]: @@ -161,16 +156,12 @@ class Attention(nn.Layer): after projection to queries, keys, and values. """ - def __init__(self, - embedding_dim: int, - num_heads: int, - downsample_rate: int=1) -> None: + def __init__(self, embedding_dim: int, num_heads: int, downsample_rate: int = 1) -> None: super().__init__() self.embedding_dim = embedding_dim self.internal_dim = embedding_dim // downsample_rate self.num_heads = num_heads - assert (self.internal_dim % num_heads == 0 - ), "num_heads must divide embedding_dim." + assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim." self.q_proj = nn.Linear(embedding_dim, self.internal_dim) self.k_proj = nn.Linear(embedding_dim, self.internal_dim) self.v_proj = nn.Linear(embedding_dim, self.internal_dim) @@ -194,10 +185,10 @@ def forward(self, q, k, v): k = self._separate_heads(k, self.num_heads) v = self._separate_heads(v, self.num_heads) _, _, _, c_per_head = q.shape - attn = q @k.transpose([0, 1, 3, 2]) + attn = q @ k.transpose([0, 1, 3, 2]) attn = attn / math.sqrt(c_per_head) attn = F.softmax(attn, axis=-1) - out = attn @v + out = attn @ v out = self._recombine_heads(out) out = self.out_proj(out) return out diff --git a/paddlemix/models/visualglm/configuration.py b/paddlemix/models/visualglm/configuration.py index a0d326cb0502c..9ec9ae6ce6c5a 100644 --- a/paddlemix/models/visualglm/configuration.py +++ b/paddlemix/models/visualglm/configuration.py @@ -72,22 +72,23 @@ class VisualGLMVisionConfig(PretrainedConfig): model_type = "visualglm_vision_model" def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - num_hidden_layers=39, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=0.00001, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - initializer_factor=1.0, - qkv_bias=True, - **kwargs, ): + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -107,17 +108,13 @@ def __init__( self.qkv_bias = qkv_bias @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from VisualGLMConfig if config_dict.get("model_type") == "visualglm": config_dict = config_dict["vision_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
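Editor's note: a hedged sketch of how the from_pretrained override above resolves nested configs: when the stored config's model_type is "visualglm", only its "vision_config" section is kept. The override values and the checkpoint path below are placeholders; the import path is inferred from the file path in this diff.

from paddlemix.models.visualglm.configuration import VisualGLMVisionConfig  # assumed import path

# Explicit construction, overriding a couple of fields (all others keep the defaults above).
vision_config = VisualGLMVisionConfig(image_size=224, patch_size=14)

# Loading from a full VisualGLM checkpoint would pull out just the nested vision section:
# vision_config = VisualGLMVisionConfig.from_pretrained("path/to/visualglm-checkpoint")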
@@ -181,23 +178,24 @@ class VisualGLMQFormerConfig(PretrainedConfig): model_type = "visualglm_qformer_model" def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, ): + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.hidden_size = hidden_size @@ -216,18 +214,14 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from VisualGLMConfig if config_dict.get("model_type") == "visualglm": config_dict = config_dict["qformer_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -280,42 +274,34 @@ class VisualGLMConfig(PretrainedConfig): model_type = "visualglm" def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - **kwargs, ): + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): super().__init__(**kwargs) if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the VisualGLMVisionConfig with default values." - ) + logger.info("vision_config is None. initializing the VisualGLMVisionConfig with default values.") if qformer_config is None: qformer_config = {} - logger.info( - "qformer_config is None. Initializing the VisualGLMQFormerConfig with default values." - ) + logger.info("qformer_config is None. Initializing the VisualGLMQFormerConfig with default values.") if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the text config with default values (`ChatGLMConfig`)." - ) + logger.info("text_config is None. 
Initializing the text config with default values (`ChatGLMConfig`).") self.vision_config = VisualGLMVisionConfig(**vision_config) self.qformer_config = VisualGLMQFormerConfig(**qformer_config) - text_model_type = (text_config["model_type"] - if "model_type" in text_config else "chatglm") + text_model_type = text_config["model_type"] if "model_type" in text_config else "chatglm" if text_model_type == "chatglm": self.text_config = ChatGLMConfig(**text_config) else: - raise ValueError( - "Only chatglm accepted for model_type, but accepted {}.".format( - text_model_type)) + raise ValueError("Only chatglm accepted for model_type, but accepted {}.".format(text_model_type)) self.num_query_tokens = num_query_tokens self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size @@ -325,11 +311,12 @@ def __init__( @classmethod def from_vision_qformer_text_configs( - cls, - vision_config: VisualGLMVisionConfig, - qformer_config: VisualGLMQFormerConfig, - text_config: PretrainedConfig, - **kwargs, ): + cls, + vision_config: VisualGLMVisionConfig, + qformer_config: VisualGLMQFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): r""" Instantiate a [`VisualGLMConfig`] (or a derived class) from a vision model, Q-Former and language model configurations. @@ -341,7 +328,8 @@ def from_vision_qformer_text_configs( vision_config=vision_config.to_dict(), qformer_config=qformer_config.to_dict(), text_config=text_config.to_dict(), - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/visualglm/modeling.py b/paddlemix/models/visualglm/modeling.py index 6358f9cd8ca20..f478f79caf776 100644 --- a/paddlemix/models/visualglm/modeling.py +++ b/paddlemix/models/visualglm/modeling.py @@ -24,18 +24,27 @@ from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig from paddlenlp.transformers.chatglm.modeling import ChatGLMForCausalLM from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, - ModelOutput) + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) from paddlenlp.transformers.model_utils import ( - PretrainedModel, apply_chunking_to_forward, - find_pruneable_heads_and_indices, prune_linear_layer) + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) from ...activations import ACT2FN from ...utils.initializer import normal_, ones_, zeros_ from ...utils.log import logger -from .configuration import (VisualGLMConfig, VisualGLMQFormerConfig, - VisualGLMVisionConfig) +from .configuration import ( + VisualGLMConfig, + VisualGLMQFormerConfig, + VisualGLMVisionConfig, +) VisualGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [] @@ -53,7 +62,8 @@ def Parameter(tensor, dtype="float16"): return paddle.create_parameter( tensor.shape, dtype=tensor.dtype, - default_initializer=nn.initializer.Assign(tensor), ) + default_initializer=nn.initializer.Assign(tensor), + ) @dataclass @@ -81,9 +91,11 @@ class VisualGLMForConditionalGenerationModelOutput(ModelOutput): def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in - ["vision_outputs", "qformer_outputs", "language_model_outputs"] else - getattr(self, k).to_tuple() for k in self.keys()) + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) 
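# Editor's note (illustrative aside, not part of this diff): a sketch of assembling a full
# VisualGLMConfig from its three sub-configs via from_vision_qformer_text_configs, shown in
# the configuration.py hunks above. Default sub-configs are assumed to be mutually
# compatible; import paths are inferred from the file paths in this diff.
from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig
from paddlemix.models.visualglm.configuration import (
    VisualGLMConfig,
    VisualGLMQFormerConfig,
    VisualGLMVisionConfig,
)

config = VisualGLMConfig.from_vision_qformer_text_configs(
    vision_config=VisualGLMVisionConfig(),
    qformer_config=VisualGLMQFormerConfig(),
    text_config=ChatGLMConfig(),
)
# VisualGLMConfig.__init__ then ties qformer_config.encoder_hidden_size to vision_config.hidden_size (1408).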
class VisualGLMPretrainedModel(PretrainedModel): @@ -95,13 +107,14 @@ class VisualGLMPretrainedModel(PretrainedModel): config_class = VisualGLMConfig base_model_prefix = "visualglm" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", ] + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or - isinstance(module, nn.Linear)): + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -111,7 +124,9 @@ def _init_weights(self, module): factor = self.config.vision_config.initializer_range trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) trunc_normal_(module.position_embedding) - trunc_normal_(module.class_embedding, ) + trunc_normal_( + module.class_embedding, + ) elif isinstance(module, nn.LayerNorm): zeros_(module.bias) ones_(module.weight) @@ -136,30 +151,30 @@ def __init__(self, config: VisualGLMVisionConfig): in_channels=self.in_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, - stride=self.patch_size, ) + stride=self.patch_size, + ) - self.num_patches = (self.image_size // self.patch_size)**2 + self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.class_embedding = Parameter( paddle.randn([1, 1, self.embed_dim]), - dtype=self.patch_embedding.weight.dtype, ) + dtype=self.patch_embedding.weight.dtype, + ) self.position_embedding = Parameter( paddle.randn([1, self.num_positions, self.embed_dim]), - dtype=self.patch_embedding.weight.dtype, ) + dtype=self.patch_embedding.weight.dtype, + ) def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding( - pixel_values) # shape = [*, width, grid, grid] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) - class_embeds = self.class_embedding.expand( - [batch_size, 1, -1]).cast(target_dtype) + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) - embeddings = embeddings + self.position_embedding[:, :embeddings.shape[ - 1], :].cast(target_dtype) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) return embeddings @@ -175,60 +190,52 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") + f" {self.num_heads})." 
+ ) self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) # small tweak here compared to CLIP, no bias here - self.qkv = nn.Linear( - self.embed_dim, 3 * self.embed_dim, bias_attr=False) + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) if config.qkv_bias: - q_bias = Parameter( - paddle.zeros( - [self.embed_dim], dtype=self.qkv.weight.dtype)) - v_bias = Parameter( - paddle.zeros( - [self.embed_dim], dtype=self.qkv.weight.dtype)) + q_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + v_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) else: q_bias = None v_bias = None if q_bias is not None: - qkv_bias = paddle.concat( - (q_bias, paddle.zeros_like(v_bias), v_bias)) + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) self.qkv.bias = Parameter(qkv_bias, dtype=self.qkv.weight.dtype) self.projection = nn.Linear(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - head_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[ - paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ - paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape mixed_qkv = self.qkv(hidden_states) - mixed_qkv = mixed_qkv.reshape( - [bsz, tgt_len, 3, self.num_heads, - embed_dim // self.num_heads]).transpose([2, 0, 3, 1, 4]) + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) query_states, key_states, value_states = ( mixed_qkv[0], mixed_qkv[1], - mixed_qkv[2], ) + mixed_qkv[2], + ) # Take the dot product between "query" and "key" to get the raw attention scores. 
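# Editor's note (illustrative aside, not part of this diff): the lines that follow compute
# scaled dot-product attention scores, i.e. scores = (q @ k^T) * head_dim ** -0.5, using the
# self.scale defined above. Standalone shape sketch under assumed sizes
# (batch=2, heads=12, seq_len=16, head_dim=64):
import paddle
q = paddle.randn([2, 12, 16, 64])
k = paddle.randn([2, 12, 16, 64])
scores = paddle.matmul(q, k, transpose_y=True) * 64**-0.5  # -> [2, 12, 16, 16]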
- attention_scores = paddle.matmul( - query_states, key_states, transpose_y=True) + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) attention_scores = attention_scores * self.scale @@ -243,16 +250,16 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = paddle.matmul(attention_probs, value_states).transpose( - [0, 2, 1, 3]) + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [self.embed_dim, ] + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] context_layer = context_layer.reshape(new_context_layer_shape) output = self.projection(context_layer) - outputs = (output, attention_probs) if output_attentions else (output, - None) + outputs = (output, attention_probs) if output_attentions else (output, None) return outputs @@ -277,17 +284,16 @@ def __init__(self, config: VisualGLMConfig): super().__init__() self.embed_dim = config.hidden_size self.self_attn = VisualGLMAttention(config) - self.layer_norm1 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = VisualGLMMLP(config) - self.layer_norm2 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -304,7 +310,8 @@ def forward( hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, head_mask=attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = hidden_states + residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) @@ -312,10 +319,10 @@ def forward( hidden_states = hidden_states + residual - outputs = (hidden_states, ) + outputs = (hidden_states,) if output_attentions: - outputs += (attn_weights, ) + outputs += (attn_weights,) return outputs @@ -332,20 +339,17 @@ class VisualGLMEncoder(nn.Layer): def __init__(self, config: VisualGLMConfig): super().__init__() self.config = config - self.layers = nn.LayerList([ - VisualGLMEncoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.LayerList([VisualGLMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -367,13 +371,11 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -381,7 +383,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -393,29 +395,30 @@ def custom_forward(*inputs): layer_outputs = recompute( create_custom_forward(encoder_layer), hidden_states, - attention_mask, ) + attention_mask, + ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, ) + attentions=all_attentions, + ) class VisualGLMVisionModel(VisualGLMPretrainedModel): @@ -429,26 +432,23 @@ def __init__(self, config: VisualGLMVisionConfig): self.embeddings = VisualGLMVisionEmbeddings(config) self.encoder = VisualGLMEncoder(config) - self.post_layernorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -459,7 +459,8 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) @@ -474,7 +475,8 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) def get_input_embeddings(self): return self.embeddings @@ -484,35 +486,29 @@ class VisualGLMQFormerMultiHeadAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention heads (%d)" - % (config.hidden_size, config.num_attention_heads)) + % (config.hidden_size, config.num_attention_heads) + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) - self.value = nn.Linear(config.encoder_hidden_size, - self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -536,30 +532,28 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. 
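# Editor's note (illustrative aside, not part of this diff): in the cached-decoding branch
# below, past keys/values of shape [batch, num_heads, past_len, head_dim] are concatenated
# with the new step along axis=2, the sequence axis. Shape sketch under assumed sizes:
import paddle
past_k = paddle.randn([1, 12, 10, 64])      # 10 cached positions
new_k = paddle.randn([1, 12, 1, 64])        # 1 new position
k = paddle.concat([past_k, new_k], axis=2)  # -> [1, 12, 11, 64]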
is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -571,37 +565,25 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul( - query_layer, key_layer, transpose_y=True) + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": seq_length = hidden_states.shape[1] - position_ids_l = paddle.arange( - seq_length, dtype="int64").reshape([-1, 1]) - position_ids_r = paddle.arange( - seq_length, dtype="int64").reshape([1, -1]) + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.cast( - dtype=query_layer.dtype) # fp16 compatibility + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": - relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = ( - attention_scores + relative_position_scores_query + - relative_position_scores_key) + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) @@ -630,10 +612,9 @@ def forward( ] 
context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -641,12 +622,10 @@ class VisualGLMQFormerSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -656,8 +635,7 @@ def forward(self, hidden_states: paddle.Tensor, class VisualGLMQFormerAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() - self.attention = VisualGLMQFormerMultiHeadAttention(config, - is_cross_attention) + self.attention = VisualGLMQFormerMultiHeadAttention(config, is_cross_attention) self.output = VisualGLMQFormerSelfOutput(config) self.pruned_heads = set() @@ -668,7 +646,8 @@ def prune_heads(self, heads): heads, self.attention.num_attention_heads, self.attention.attention_head_size, - self.pruned_heads, ) + self.pruned_heads, + ) # Prune linear layers self.attention.query = prune_linear_layer(self.attention.query, index) @@ -677,21 +656,20 @@ def prune_heads(self, heads): self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len( - heads) - self.attention.all_head_size = (self.attention.attention_head_size * - self.attention.num_attention_heads) + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - head_mask: Optional[paddle.Tensor]=None, - encoder_hidden_states: Optional[paddle.Tensor]=None, - encoder_attention_mask: Optional[paddle.Tensor]=None, - past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: self_outputs = self.attention( hidden_states, attention_mask, @@ -699,10 +677,10 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = 
(attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -728,8 +706,7 @@ def __init__(self, config): # self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = hidden_states + input_tensor @@ -742,15 +719,13 @@ def __init__(self, config, layer_idx): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.input_layernorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.attention = VisualGLMQFormerAttention(config) self.layer_idx = layer_idx if layer_idx % config.cross_attention_frequency == 0: - self.crossattention = VisualGLMQFormerAttention( - config, is_cross_attention=True) + self.crossattention = VisualGLMQFormerAttention(config, is_cross_attention=True) self.has_cross_attention = True else: self.has_cross_attention = False @@ -759,25 +734,26 @@ def __init__(self, config, layer_idx): self.output_query = VisualGLMQFormerOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None hidden_states = self.input_layernorm(hidden_states) self_attention_outputs = self.attention( hidden_states, # 1, 32, 768 attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -788,16 +764,15 @@ def forward( if self.has_cross_attention: if encoder_hidden_states is None: - raise ValueError( - "encoder_hidden_states must be given for cross-attention layers" - ) + raise ValueError("encoder_hidden_states must be given for cross-attention layers") cross_attention_outputs = self.crossattention( query_attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) query_attention_output = cross_attention_outputs[0] # add cross attentions if we output attention weights outputs = outputs + cross_attention_outputs[1:-1] @@ -806,25 +781,27 @@ def forward( self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, ) + query_attention_output, + ) if attention_output.shape[1] > query_length: layer_output_text = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], ) - layer_output = paddle.concat( - [layer_output, 
layer_output_text], axis=1) + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) else: layer_output = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, ) - outputs = (layer_output, ) + outputs + attention_output, + ) + outputs = (layer_output,) + outputs - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs @@ -843,25 +820,25 @@ class VisualGLMQFormerEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList([ - VisualGLMQFormerLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) + self.layer = nn.LayerList( + [VisualGLMQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions else None @@ -871,14 +848,12 @@ def forward( for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None - if getattr(self.config, "gradient_checkpointing", - False) and self.training: + if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -887,8 +862,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions, query_length) + return module(*inputs, past_key_value, output_attentions, query_length) return custom_forward @@ -898,7 +872,8 @@ def custom_forward(*inputs): attention_mask, layer_head_mask, encoder_hidden_states, - encoder_attention_mask, ) + encoder_attention_mask, + ) else: layer_outputs = layer_module( hidden_states, @@ -908,35 +883,39 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, ) + query_length, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) if layer_module.has_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) + cross_attentions=all_cross_attentions, + ) class VisualGLMQFormerModel(VisualGLMPretrainedModel): @@ -948,8 +927,7 @@ def __init__(self, config: VisualGLMQFormerConfig): super().__init__(config) self.config = config - self.final_layernorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.encoder = VisualGLMQFormerEncoder(config) @@ -969,10 +947,11 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - has_query: bool=False, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -993,21 +972,21 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
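# Editor's note (illustrative aside, not part of this diff): a numeric example of the
# additive mask built just below. A keep/ignore mask of [1, 1, 0] becomes [0, 0, -10000],
# which pushes the softmax weight of the masked position towards zero.
import paddle
attention_mask = paddle.to_tensor([1.0, 1.0, 0.0])
additive = (1.0 - attention_mask) * -10000.0  # -> [0., 0., -10000.]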
- extended_attention_mask = extended_attention_mask.cast( - dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - def invert_attention_mask( - self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: """ Invert an attention mask (e.g., switches 0. and 1.). Args: @@ -1016,28 +995,27 @@ def invert_attention_mask( `paddle.Tensor`: The inverted attention mask. """ if encoder_attention_mask.ndim == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, - None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.ndim == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, - None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) encoder_extended_attention_mask = encoder_extended_attention_mask.cast( - dtype=self.config.dtype) # fp16 compatibility - encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * -1e4 + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 return encoder_extended_attention_mask def get_head_mask( - self, - head_mask: Optional[paddle.Tensor], - num_hidden_layers: int, - is_attention_chunked: bool=False, ) -> paddle.Tensor: + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, + ) -> paddle.Tensor: """ Prepare the head mask if needed. Args: @@ -1052,8 +1030,7 @@ def get_head_mask( `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, - num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -1064,30 +1041,27 @@ def get_head_mask( def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.ndim == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) elif head_mask.ndim == 2: - head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) - ) # We can specify head_mask for each layer + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.cast( - dtype=self.config. 
- dtype) # switch to float if need + fp16 compatibility + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - query_embeds, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1107,18 +1081,16 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_key_values_length past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None else 0) + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) query_length = query_embeds.shape[1] if query_embeds is not None else 0 @@ -1128,39 +1100,32 @@ def forward( batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
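# Editor's note (illustrative aside, not part of this diff): get_extended_attention_mask,
# defined earlier in this file, broadcasts a 2D padding mask [batch, seq_len] to
# [batch, 1, 1, seq_len], and a 3D [batch, from_len, to_len] mask to
# [batch, 1, from_len, to_len]. Shape sketch under assumed sizes:
import paddle
mask_2d = paddle.ones([4, 32])
print(mask_2d[:, None, None, :].shape)  # [4, 1, 1, 32]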
- extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0].shape + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape else: ( encoder_batch_size, encoder_sequence_length, - _, ) = encoder_hidden_states.shape + _, + ) = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: - encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) - for mask in encoder_attention_mask - ] + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1182,7 +1147,8 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, ) + query_length=query_length, + ) sequence_output = encoder_outputs[0] sequence_output = self.final_layernorm(sequence_output) pooled_output = sequence_output[:, 0, :] @@ -1196,7 +1162,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class VisualGLMModel(VisualGLMPretrainedModel): @@ -1208,27 +1175,26 @@ def __init__(self, config: VisualGLMConfig): self.vision_model = VisualGLMVisionModel(config.vision_config) self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ]), - dtype=self.config.dtype, ) + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), + dtype=self.config.dtype, + ) self.qformer = VisualGLMQFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) self.language_model = ChatGLMForCausalLM(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def get_text_features( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: 
text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): @@ -1246,30 +1212,30 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) >>> text_features = model.get_text_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.language_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return text_outputs def get_image_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1289,32 +1255,30 @@ def get_image_features( >>> inputs = processor.process_images(images=image, return_tensors="pd") >>> image_outputs = model.get_image_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return vision_outputs def get_qformer_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1334,56 +1298,51 @@ def get_qformer_features( >>> inputs = 
processor.process_images(images=image, return_tensors="pd") >>> qformer_outputs = model.get_qformer_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) image_embeds = vision_outputs[0] - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) + return_dict=True, + ) return query_outputs def forward( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - labels: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, VisualGLMForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, VisualGLMForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -1401,68 +1360,60 @@ def forward( >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") >>> outputs = model(**inputs) ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") - - first_embeds = self.language_model.chatglm.transformer.word_embeddings( - first_input_ids) - second_embeds = self.language_model.chatglm.word_embeddings( - second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.chatglm.transformer.word_embeddings(first_input_ids) + second_embeds = self.language_model.chatglm.word_embeddings(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones_like( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones_like(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones_like( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones_like(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) logits = outputs.logits if return_dict else outputs[0] loss = None # we compute the loss here since we need to take into account the sequence length of the query embeds if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] @@ -1472,18 +1423,20 @@ def forward( loss = loss_fct( shift_logits.reshape([-1, self.config.text_config.vocab_size]), - shift_labels.reshape([-1]), ) + 
shift_labels.reshape([-1]), + ) if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) - return ((loss, ) + output) if loss is not None else output + return ((loss,) + output) if loss is not None else output return VisualGLMForConditionalGenerationModelOutput( loss=loss, logits=logits, vision_outputs=vision_outputs, qformer_outputs=query_outputs, - language_model_outputs=outputs, ) + language_model_outputs=outputs, + ) class ChatGLMForConditionalGenerationWithImage(ChatGLMForCausalLM): @@ -1492,27 +1445,25 @@ def __init__(self, config: ChatGLMConfig): self.config = config def forward( - self, - image_features: paddle.Tensor, - input_ids: paddle.Tensor, - position_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - pre_image_length: Optional[int]=None, - cache: Optional[Tuple[paddle.Tensor]]=None, - inputs_embeds: Optional[paddle.Tensor]=None, - labels: Optional[paddle.Tensor]=None, - use_cache: Optional[bool]=None, - return_dict: Optional[bool]=None, ): - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + self, + image_features: paddle.Tensor, + input_ids: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + pre_image_length: Optional[int] = None, + cache: Optional[Tuple[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if inputs_embeds is None and cache is None and image_features is not None: - pre_ids, pad_ids, post_ids = paddle.split( - input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) + pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) pre_txt_emb = self.chatglm.transformer.word_embeddings(pre_ids) post_txt_emb = self.chatglm.transformer.word_embeddings(post_ids) - inputs_embeds = paddle.concat( - [pre_txt_emb, image_features, post_txt_emb], axis=1) + inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1) outputs = super().forward( input_ids=input_ids, @@ -1522,7 +1473,8 @@ def forward( inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, - return_dict=return_dict, ) + return_dict=return_dict, + ) return outputs @@ -1536,44 +1488,37 @@ def __init__(self, config: VisualGLMConfig): self.config = config self.vision_model = VisualGLMVisionModel(config.vision_config) self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ]), - dtype=self.config.dtype, ) + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), + dtype=self.config.dtype, + ) self.qformer = VisualGLMQFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) - self.language_model = ChatGLMForConditionalGenerationWithImage( - config.text_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGenerationWithImage(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def encode_images( - self, - pixel_values: paddle.Tensor, # processed image + self, + pixel_values: paddle.Tensor, # processed image ): # step 1: 
forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.final_layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.final_layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: mapping query_output into language_model space @@ -1583,12 +1528,13 @@ def encode_images( @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor, - pre_image_length: int, - attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + pre_image_length: int, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
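Note (editorial aside, not part of the diff): during conditional generation the projected Q-Former output is spliced into the ChatGLM prompt at the embedding level, with the layout `[prefix tokens | 32 image slots | suffix tokens]`. A condensed sketch of that splice, reusing the names from the `ChatGLMForConditionalGenerationWithImage.forward` hunk above (the free-standing helper itself is illustrative):

```python
import paddle

def splice_image_features(input_ids, image_features, word_embeddings, pre_image_length):
    # input_ids layout: [prefix tokens | 32 image placeholders | suffix tokens]
    pre_ids, _, post_ids = paddle.split(
        input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1
    )
    pre_emb = word_embeddings(pre_ids)    # text embeddings before the image
    post_emb = word_embeddings(post_ids)  # text embeddings after the image
    # the 32 placeholder positions are replaced by the projected Q-Former features
    return paddle.concat([pre_emb, image_features, post_emb], axis=1)
```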
Args: @@ -1625,6 +1571,7 @@ def generate( image_features=image_features, pre_image_length=pre_image_length, attention_mask=attention_mask, - **generate_kwargs, ) + **generate_kwargs, + ) return outputs diff --git a/paddlemix/optimization.py b/paddlemix/optimization.py index c11363d24087c..70d5400739722 100644 --- a/paddlemix/optimization.py +++ b/paddlemix/optimization.py @@ -42,15 +42,16 @@ class CosineDecayWithWarmup(LRScheduler): """ def __init__( - self, - learning_rate, - epochs, - eta_min=0.0, - warmup_steps=0, - warmup_start_lr=0.0, - last_epoch=-1, - step_each_epoch=1, - **kwargs, ): + self, + learning_rate, + epochs, + eta_min=0.0, + warmup_steps=0, + warmup_start_lr=0.0, + last_epoch=-1, + step_each_epoch=1, + **kwargs, + ): self.start_lr = learning_rate self.T_max = epochs self.eta_min = eta_min @@ -70,12 +71,13 @@ def step(self): cur_step_in_epoch = (self.cur_step - 2) % self.step_each_epoch cur_epoch = (self.cur_step - 2) // self.step_each_epoch if self.cur_step < self.warmup_steps and cur_epoch == 0: - self.last_lr = self.warmup_start_lr + ( - self.start_lr - self.warmup_start_lr) * cur_step_in_epoch / max( - self.warmup_steps, 1) + self.last_lr = self.warmup_start_lr + (self.start_lr - self.warmup_start_lr) * cur_step_in_epoch / max( + self.warmup_steps, 1 + ) else: self.last_lr = (self.start_lr - self.eta_min) * 0.5 * ( - 1.0 + math.cos(math.pi * cur_epoch / self.T_max)) + self.eta_min + 1.0 + math.cos(math.pi * cur_epoch / self.T_max) + ) + self.eta_min self.last_epoch = cur_epoch def get_lr(self): @@ -164,11 +166,8 @@ def get_parameters(args, model, assigner, tower): skip = set() if tower == "visual": lr = args.visual_lr if args.visual_lr is not None else args.learning_rate - weight_decay = (args.visual_wd - if args.visual_wd is not None else args.weight_decay) - filter_parameters = [[name, param] - for name, param in model.named_parameters() - if "visual." in name] + weight_decay = args.visual_wd if args.visual_wd is not None else args.weight_decay + filter_parameters = [[name, param] for name, param in model.named_parameters() if "visual." in name] if hasattr(model, "visual"): if hasattr(model.visual, "no_weight_decay"): skip = set.union(skip, model.visual.no_weight_decay()) @@ -176,9 +175,7 @@ def get_parameters(args, model, assigner, tower): elif tower == "text": lr = args.text_lr if args.text_lr is not None else args.learning_rate weight_decay = args.text_wd if args.text_wd is not None else args.weight_decay - filter_parameters = [[name, param] - for name, param in model.named_parameters() - if "text." in name] + filter_parameters = [[name, param] for name, param in model.named_parameters() if "text." in name] if hasattr(model, "text"): if hasattr(model.text, "no_weight_decay"): skip = set.union(skip, model.text.no_weight_decay()) @@ -187,8 +184,7 @@ def get_parameters(args, model, assigner, tower): lr = args.learning_rate weight_decay = args.weight_decay exclude = lambda n: "visual." not in n and "text." 
not in n - filter_parameters = [[n, p] for n, p in model.named_parameters() - if exclude(n)] + filter_parameters = [[n, p] for n, p in model.named_parameters() if exclude(n)] if hasattr(model, "no_weight_decay"): skip = set.union(skip, model.no_weight_decay()) get_num_layer = assigner.get_layer_id if assigner is not None else None @@ -236,11 +232,8 @@ def get_parameters(args, model, assigner, tower): if is_master(args): logging.info(f"Tower = {tower}") logging.info(f"Skip weight decay name marked in tower-{tower}: {skip}") - logging.info( - f"Num of parameters group in tower-{tower}: {len(parameter_group_vars.values())}" - ) - logging.info( - f"Param groups = {json.dumps(parameter_group_names, indent=2)}") + logging.info(f"Num of parameters group in tower-{tower}: {len(parameter_group_vars.values())}") + logging.info(f"Param groups = {json.dumps(parameter_group_names, indent=2)}") return list(parameter_group_vars.values()) @@ -250,20 +243,19 @@ def get_assigner(args, model): if visual_ld < 1.0: visual_num_layers = model.visual.get_num_layers() assigner_visual = LayerDecayValueAssigner( - list(visual_ld**(visual_num_layers + 1 - i) - for i in range(visual_num_layers + 2))) + list(visual_ld ** (visual_num_layers + 1 - i) for i in range(visual_num_layers + 2)) + ) else: assigner_visual = None if text_ld < 1.0 and hasattr(model, "text"): text_num_layers = model.text.get_num_layers() assigner_text = LayerDecayValueAssigner( - list(text_ld**(text_num_layers + 1 - i) - for i in range(text_num_layers + 2))) + list(text_ld ** (text_num_layers + 1 - i) for i in range(text_num_layers + 2)) + ) else: assigner_text = None if assigner_visual is not None: - logging.info("Assigned visual values = %s" % - str(assigner_visual.values)) + logging.info("Assigned visual values = %s" % str(assigner_visual.values)) if assigner_text is not None: logging.info("Assigned text values = %s" % str(assigner_text.values)) return assigner_visual, assigner_text @@ -286,8 +278,7 @@ def get_all_parameters(args, model): def print_optim(optimizer): for param_group in optimizer._param_groups: - print(param_group["group"], param_group["learning_rate"], - param_group["lr_scale"]) + print(param_group["group"], param_group["learning_rate"], param_group["lr_scale"]) def create_optimizer(args, model, lr_scheduler=None, return_params=False): diff --git a/paddlemix/processors/base_processing.py b/paddlemix/processors/base_processing.py index ae9bf62d18a8b..bdaa5d6f583b6 100644 --- a/paddlemix/processors/base_processing.py +++ b/paddlemix/processors/base_processing.py @@ -43,24 +43,22 @@ def __init__(self, *args, **kwargs): raise TypeError(f"Unexepcted keyword argument {key}.") for arg, attribute_name in zip(args, self.attributes): if attribute_name in kwargs: - raise TypeError( - f"Got multiple values for argument {attribute_name}.") + raise TypeError(f"Got multiple values for argument {attribute_name}.") else: kwargs[attribute_name] = arg if len(kwargs) != len(self.attributes): raise ValueError( f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got " - f"{len(args)} arguments instead.") + f"{len(args)} arguments instead." 
+ ) # Check each arg is of the proper class (this will also catch a user initializing in the wrong order) for attribute_name, arg in kwargs.items(): setattr(self, attribute_name, arg) def __repr__(self): - attributes_repr = [ - f"- {name}: {repr(getattr(self, name))}" for name in self.attributes - ] + attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] attributes_repr = "\n".join(attributes_repr) return f"{self.__class__.__name__}:\n{attributes_repr}" @@ -122,13 +120,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. """ - args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, - **kwargs) + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) return cls(*args) @classmethod - def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, - **kwargs): + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): args = [] for attribute_name in cls.attributes: class_name = getattr(cls, f"{attribute_name}_class") @@ -136,9 +132,7 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, attribute_class = getattr(paddlemix.processors, class_name, None) if attribute_class is None: attribute_class = getattr(paddlenlp.transformers, class_name) - args.append( - attribute_class.from_pretrained(pretrained_model_name_or_path, - **kwargs)) + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) return args @property diff --git a/paddlemix/processors/blip_processing.py b/paddlemix/processors/blip_processing.py index f74eaf37fc175..ff190c0b94be6 100644 --- a/paddlemix/processors/blip_processing.py +++ b/paddlemix/processors/blip_processing.py @@ -21,17 +21,33 @@ import numpy as np import PIL from paddlenlp.transformers.tokenizer_utils_base import ( - BatchEncoding, PreTokenizedInput, TensorType, TextInput) + BatchEncoding, + PreTokenizedInput, + TensorType, + TextInput, +) from .base_processing import ProcessorMixin from .image_transform_utils import ( - convert_to_rgb, normalize, random_horizontal_flip, random_resized_crop, - rescale, resize, to_channel_dimension_format) -from .image_utils import (IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, - ChannelDimension, ImageInput, PILImageResampling, - load_image, to_numpy_array, valid_images) -from .processing_utils import (BaseImageProcessor, BaseTextProcessor, - get_size_dict) + convert_to_rgb, + normalize, + random_horizontal_flip, + random_resized_crop, + rescale, + resize, + to_channel_dimension_format, +) +from .image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + load_image, + to_numpy_array, + valid_images, +) +from .processing_utils import BaseImageProcessor, BaseTextProcessor, get_size_dict __all__ = [ "Blip2Processor", @@ -60,14 +76,14 @@ def __init__(self, image_processor, text_processor, tokenizer): super().__init__(image_processor, text_processor, tokenizer) def __call__( - self, - images=None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[ - PreTokenizedInput]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - max_length=32, - mode="train", - **kwargs, ) -> BatchEncoding: + self, + images=None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + return_tensors: 
Optional[Union[str, TensorType]] = None, + max_length=32, + mode="train", + **kwargs, + ) -> BatchEncoding: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to Bert's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -116,12 +132,12 @@ def __call__( return_token_type_ids=False, max_length=32, padding=True, - **kwargs, ) + **kwargs, + ) return text_encoding # add pixel_values - encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, mode=mode) + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors, mode=mode) if text is not None: text_encoding = self.text_processor(text, mode=mode) @@ -131,7 +147,8 @@ def __call__( padding="longest", truncation=True, max_length=max_length, - return_attention_mask=True) + return_attention_mask=True, + ) else: text_encoding = None # eos_token_id = None @@ -159,8 +176,7 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class BlipTextProcessor(BaseTextProcessor): @@ -180,31 +196,31 @@ class BlipTextProcessor(BaseTextProcessor): """ def __init__( - self, - prompt: str="", - do_caption: bool=False, - do_question: bool=False, - max_words: int=50, - **kwargs, ): + self, + prompt: str = "", + do_caption: bool = False, + do_question: bool = False, + max_words: int = 50, + **kwargs, + ): super().__init__(**kwargs) if do_question and do_caption: - raise ValueError( - "do_caption and do_question cannot be set at the same time.") + raise ValueError("do_caption and do_question cannot be set at the same time.") if not do_caption and not do_question: - raise ValueError( - "Either do_caption or do_question must be set to True.") + raise ValueError("Either do_caption or do_question must be set to True.") self.prompt = prompt self.do_caption = do_caption self.do_question = do_question self.max_words = max_words def __call__( - self, - text, - do_caption: Optional[bool]=None, - do_question: Optional[bool]=None, - mode: str="train", - **kwargs, ): + self, + text, + do_caption: Optional[bool] = None, + do_question: Optional[bool] = None, + mode: str = "train", + **kwargs, + ): """ Preprocess the text before tokenization. 
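Note (editorial aside, not part of the diff): a rough usage sketch of the `Blip2Processor.__call__` path reformatted above; the checkpoint name is hypothetical and the dummy image only keeps the snippet self-contained:

```python
import PIL.Image
from paddlemix.processors.blip_processing import Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")  # hypothetical checkpoint id
image = PIL.Image.new("RGB", (384, 384))  # stand-in for a real photo

encoding = processor(
    images=image,             # -> image_processor -> pixel_values
    text="a photo of a cat",  # -> text_processor (prompt/cleanup) -> tokenizer
    return_tensors="pd",
    max_length=32,
    mode="train",
)
# encoding carries pixel_values plus input_ids / attention_mask for the language model
```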
@@ -222,11 +238,9 @@ def __call__( do_caption = do_caption if do_caption is not None else self.do_caption do_question = do_question if do_question is not None else self.do_question if do_caption and do_question: - raise ValueError( - "do_caption and do_question cannot be set at the same time.") + raise ValueError("do_caption and do_question cannot be set at the same time.") if not do_caption and not do_question: - raise ValueError( - "Either do_caption or do_question must be set to True.") + raise ValueError("Either do_caption or do_question must be set to True.") if not isinstance(text, (list, tuple)): text = [text] @@ -246,18 +260,20 @@ def pre_caption(self, caption: str) -> str: caption = re.sub( r"([.!\"()*#:;~])", " ", - caption.lower(), ) + caption.lower(), + ) caption = re.sub( r"\s{2,}", " ", - caption, ) + caption, + ) caption = caption.rstrip("\n") caption = caption.strip(" ") # truncate caption caption_words = caption.split(" ") if len(caption_words) > self.max_words: - caption = " ".join(caption_words[:self.max_words]) + caption = " ".join(caption_words[: self.max_words]) return caption @@ -268,13 +284,14 @@ def pre_question(self, question: str) -> str: question = re.sub( r"([.!\"()*#:;~])", "", - question.lower(), ) + question.lower(), + ) question = question.rstrip(" ") # truncate question question_words = question.split(" ") if len(question_words) > self.max_words: - question = " ".join(question_words[:self.max_words]) + question = " ".join(question_words[: self.max_words]) return question @@ -325,23 +342,24 @@ class BlipImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - do_flip: bool=False, - flip_prob: float=0.5, - do_rand_resize_crop: bool=False, - scale: Optional[Union[List[float], Tuple[float]]]=(0.08, 1.0), - do_collate: bool=False, - mode: str="train", - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + do_flip: bool = False, + flip_prob: float = 0.5, + do_rand_resize_crop: bool = False, + scale: Optional[Union[List[float], Tuple[float]]] = (0.08, 1.0), + do_collate: bool = False, + mode: str = "train", + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 384, "width": 384} size = get_size_dict(size, default_to_square=True) @@ -352,8 +370,7 @@ def __init__( self.do_rescale = do_rescale self.rescale_factor = rescale_factor self.do_normalize = do_normalize - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_convert_rgb = do_convert_rgb self.do_flip = do_flip @@ -363,12 +380,13 @@ def __init__( self.do_collate = do_collate def resize( - self, - image: np.ndarray, - size: Dict[str, int], - 
resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -393,14 +411,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. @@ -415,12 +435,13 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -434,16 +455,16 @@ def normalize( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - return normalize( - image, mean=mean, std=std, data_format=data_format, **kwargs) + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) def random_resized_crop( - self, - image: np.ndarray, - size: Union[int, List, Tuple], - scale: float, - resample: PILImageResampling=PILImageResampling.BICUBIC, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> np.ndarray: """ Crop the input data to random size and aspect ratio. A crop of random size (default: of 0.08 to 1.0) of the original size and a random @@ -461,13 +482,9 @@ def random_resized_crop( Resampling filter to use when resiizing the image. """ size = list(size.values()) - return random_resized_crop( - image, size=size, scale=scale, resample=resample, **kwargs) + return random_resized_crop(image, size=size, scale=scale, resample=resample, **kwargs) - def random_horizontal_flip(self, - image: np.ndarray, - flip_prob: float, - **kwargs) -> np.ndarray: + def random_horizontal_flip(self, image: np.ndarray, flip_prob: float, **kwargs) -> np.ndarray: """ Horizontally flip the input data randomly with a given probability. 
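Note (editorial aside, not part of the diff): the `rescale` and `normalize` docstrings above state the arithmetic directly (`image = image * scale`, then `image = (image - mean) / std`). A tiny NumPy sketch of those two steps with the ImageNet constants this module imports:

```python
import numpy as np
from paddlemix.processors.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD

def rescale_then_normalize(image: np.ndarray) -> np.ndarray:
    # assumes an HWC uint8 array
    image = image.astype("float32") * (1 / 255)              # image = image * scale
    mean = np.asarray(IMAGENET_STANDARD_MEAN, dtype="float32")
    std = np.asarray(IMAGENET_STANDARD_STD, dtype="float32")
    return (image - mean) / std                              # image = (image - mean) / std
```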
@@ -480,25 +497,26 @@ def random_horizontal_flip(self, return random_horizontal_flip(image, flip_prob=flip_prob, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - do_flip: bool=None, - flip_prob: float=None, - do_rand_resize_crop: bool=None, - scale: Optional[Union[List[float], Tuple[float]]]=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - mode: str=None, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + do_flip: bool = None, + flip_prob: float = None, + do_rand_resize_crop: bool = None, + scale: Optional[Union[List[float], Tuple[float]]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + mode: str = None, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -546,19 +564,15 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_flip = do_flip if do_flip is not None else self.do_flip flip_prob = flip_prob if flip_prob is not None else self.flip_prob scale = scale if scale is not None else self.scale - do_rand_resize_crop = (do_rand_resize_crop - if do_rand_resize_crop is not None else - self.do_rand_resize_crop) + do_rand_resize_crop = do_rand_resize_crop if do_rand_resize_crop is not None else self.do_rand_resize_crop size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -570,30 +584,22 @@ def preprocess( images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") if do_flip and flip_prob is None: - raise ValueError( - "Flip probability must be specified if do_flip is True.") + raise ValueError("Flip probability must be specified if do_flip is True.") if do_rand_resize_crop and scale is None: - raise ValueError( - "Random resize crop probability must be specified if do_rand_resize_crop is True." - ) + raise ValueError("Random resize crop probability must be specified if do_rand_resize_crop is True.") # PIL RGBA images are converted to RGB if do_convert_rgb: @@ -603,39 +609,21 @@ def preprocess( images = [to_numpy_array(image) for image in images] if do_rand_resize_crop and mode == "train": images = [ - self.random_resized_crop( - image=image, size=size, scale=scale, resample=resample) - for image in images + self.random_resized_crop(image=image, size=size, scale=scale, resample=resample) for image in images ] elif do_resize and mode != "train": - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_flip and mode == "train": - images = [ - self.random_horizontal_flip( - image=image, flip_prob=flip_prob) for image in images - ] + images = [self.random_horizontal_flip(image=image, flip_prob=flip_prob) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: - images = [ - self.normalize( - image=image, mean=image_mean, std=image_std) - for image in images - ] + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] - images = [ - to_channel_dimension_format(image, data_format) for image in images - ] + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchEncoding(data=data, tensor_type=return_tensors) \ No newline at end of file + return BatchEncoding(data=data, tensor_type=return_tensors) diff --git a/paddlemix/processors/clip_processing.py b/paddlemix/processors/clip_processing.py index 358719d9ff002..3f095f6bd85e7 100644 --- a/paddlemix/processors/clip_processing.py +++ b/paddlemix/processors/clip_processing.py @@ -22,17 +22,29 @@ import PIL from paddle.vision.transforms import functional as F from paddlenlp.transformers.tokenizer_utils_base import ( - BatchEncoding, PreTokenizedInput, TensorType, TextInput) + BatchEncoding, + PreTokenizedInput, + TensorType, + TextInput, +) from .base_processing import ProcessorMixin from .image_transform_utils import ( - convert_to_rgb, normalize, random_horizontal_flip, random_resized_crop, - rescale, resize, to_channel_dimension_format) -from .image_utils import (IMAGENET_STANDARD_MEAN, 
IMAGENET_STANDARD_STD, - ChannelDimension, ImageInput, PILImageResampling, - load_image, to_numpy_array, valid_images) -from .processing_utils import (BaseImageProcessor, BaseTextProcessor, - get_size_dict) + convert_to_rgb, + random_horizontal_flip, + random_resized_crop, + rescale, +) +from .image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + load_image, + valid_images, +) +from .processing_utils import BaseImageProcessor, BaseTextProcessor, get_size_dict __all__ = [ "CLIPProcessor", @@ -61,14 +73,14 @@ def __init__(self, image_processor, text_processor, tokenizer): super().__init__(image_processor, text_processor, tokenizer) def __call__( - self, - images=None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[ - PreTokenizedInput]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - max_length=77, - mode="train", - **kwargs, ) -> BatchEncoding: + self, + images=None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + max_length=77, + mode="train", + **kwargs, + ) -> BatchEncoding: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to Bert's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -109,18 +121,17 @@ def __call__( raise ValueError("You have to specify either images or text.") # images PIL list - encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, mode=mode) + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors, mode=mode) - text_encoding = self.text_processor( - text, mode=mode) # text preprocessor before tokenizer + text_encoding = self.text_processor(text, mode=mode) # text preprocessor before tokenizer text_encoding = self.tokenizer( text=text_encoding, return_tensors=return_tensors, return_token_type_ids=False, max_length=max_length, padding=True, - **kwargs, ) + **kwargs, + ) for key, value in text_encoding.items(): shape = value.shape @@ -133,8 +144,7 @@ def __call__( fill_value = 0 newshape = shape newshape[-1] = max_length - shape[-1] - padtensor = paddle.full( - shape=newshape, fill_value=fill_value, dtype=value.dtype) + padtensor = paddle.full(shape=newshape, fill_value=fill_value, dtype=value.dtype) newvalue = paddle.concat([value, padtensor], axis=-1) text_encoding[key] = newvalue @@ -163,8 +173,7 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class CLIPTextProcessor(BaseTextProcessor): @@ -181,19 +190,21 @@ class CLIPTextProcessor(BaseTextProcessor): """ def __init__( - self, - prompt: str="", - max_words: int=77, - **kwargs, ): + self, + prompt: str = "", + max_words: int = 77, + **kwargs, + ): super().__init__(**kwargs) self.prompt = prompt self.max_words = max_words def __call__( - self, - text, - mode: str="train", - **kwargs, ): + self, + text, + mode: str = "train", + **kwargs, + ): """ Preprocess the text before tokenization. 
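Note (editorial aside, not part of the diff): `CLIPProcessor.__call__` pads every field of the tokenizer output up to the fixed `max_length` (77) by concatenating a constant tensor, as shown in the hunk above. A condensed sketch of that step; the real code may pick a different fill value for `input_ids`:

```python
import paddle

def pad_to_max_length(text_encoding: dict, max_length: int = 77) -> dict:
    for key, value in text_encoding.items():
        pad_len = max_length - value.shape[-1]
        if pad_len > 0:
            newshape = list(value.shape)
            newshape[-1] = pad_len
            padtensor = paddle.full(shape=newshape, fill_value=0, dtype=value.dtype)
            text_encoding[key] = paddle.concat([value, padtensor], axis=-1)
    return text_encoding
```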
@@ -218,18 +229,20 @@ def pre_caption(self, caption: str) -> str: caption = re.sub( r"([.!\"()*#:;~])", " ", - caption.lower(), ) + caption.lower(), + ) caption = re.sub( r"\s{2,}", " ", - caption, ) + caption, + ) caption = caption.rstrip("\n") caption = caption.strip(" ") # truncate caption caption_words = caption.split(" ") if len(caption_words) > self.max_words: - caption = " ".join(caption_words[:self.max_words]) + caption = " ".join(caption_words[: self.max_words]) return caption @@ -280,23 +293,24 @@ class CLIPImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - do_flip: bool=False, - flip_prob: float=0.5, - do_rand_resize_crop: bool=False, - scale: Optional[Union[List[float], Tuple[float]]]=(0.9, 1.0), - do_collate: bool=False, - mode: str="train", - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + do_flip: bool = False, + flip_prob: float = 0.5, + do_rand_resize_crop: bool = False, + scale: Optional[Union[List[float], Tuple[float]]] = (0.9, 1.0), + do_collate: bool = False, + mode: str = "train", + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 384, "width": 384} size = get_size_dict(size, default_to_square=True) @@ -307,8 +321,7 @@ def __init__( self.do_rescale = do_rescale self.rescale_factor = rescale_factor self.do_normalize = do_normalize - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_convert_rgb = do_convert_rgb self.do_flip = do_flip @@ -318,12 +331,13 @@ def __init__( self.do_collate = do_collate def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -348,14 +362,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. 
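Note (editorial aside, not part of the diff): the caption cleanup in `pre_caption` boils down to lowercasing, mapping a punctuation class to spaces, collapsing repeated whitespace, and truncating to `max_words`. A condensed standalone version of the hunk above:

```python
import re

def pre_caption(caption: str, max_words: int = 77) -> str:
    caption = re.sub(r"([.!\"()*#:;~])", " ", caption.lower())  # punctuation -> space
    caption = re.sub(r"\s{2,}", " ", caption)                   # collapse repeated whitespace
    caption = caption.rstrip("\n").strip(" ")
    words = caption.split(" ")
    if len(words) > max_words:                                  # truncate to max_words
        caption = " ".join(words[:max_words])
    return caption
```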
@@ -370,23 +386,24 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: paddle.Tensor, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: - tensor_normalize = paddle.vision.transforms.Normalize( - mean=mean, std=std, data_format=data_format, **kwargs) + self, + image: paddle.Tensor, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + tensor_normalize = paddle.vision.transforms.Normalize(mean=mean, std=std, data_format=data_format, **kwargs) return tensor_normalize(image) def random_resized_crop( - self, - image: np.ndarray, - size: Union[int, List, Tuple], - scale: float, - resample: PILImageResampling=PILImageResampling.BICUBIC, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> np.ndarray: """ Crop the input data to random size and aspect ratio. A crop of random size (default: of 0.08 to 1.0) of the original size and a random @@ -404,13 +421,9 @@ def random_resized_crop( Resampling filter to use when resiizing the image. """ size = list(size.values()) - return random_resized_crop( - image, size=size, scale=scale, resample=resample, **kwargs) + return random_resized_crop(image, size=size, scale=scale, resample=resample, **kwargs) - def random_horizontal_flip(self, - image: np.ndarray, - flip_prob: float, - **kwargs) -> np.ndarray: + def random_horizontal_flip(self, image: np.ndarray, flip_prob: float, **kwargs) -> np.ndarray: """ Horizontally flip the input data randomly with a given probability. @@ -423,25 +436,26 @@ def random_horizontal_flip(self, return random_horizontal_flip(image, flip_prob=flip_prob, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - do_flip: bool=None, - flip_prob: float=None, - do_rand_resize_crop: bool=None, - scale: Optional[Union[List[float], Tuple[float]]]=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - mode: str=None, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + do_flip: bool = None, + flip_prob: float = None, + do_rand_resize_crop: bool = None, + scale: Optional[Union[List[float], Tuple[float]]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + mode: str = None, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
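Note (editorial aside, not part of the diff): in train mode `preprocess` chains random resized crop, random horizontal flip, rescale, and normalize per image before converting to the channel-first layout (the CLIP variant further down normalizes on the stacked batch instead). A compressed sketch of that loop, following the BLIP hunk earlier; `proc` stands for an image-processor instance and the argument plumbing is simplified:

```python
def preprocess_train(proc, images, size, scale, resample, flip_prob, rescale_factor, mean, std):
    out = []
    for image in images:
        image = proc.random_resized_crop(image=image, size=size, scale=scale, resample=resample)
        image = proc.random_horizontal_flip(image=image, flip_prob=flip_prob)
        image = proc.rescale(image=image, scale=rescale_factor)
        image = proc.normalize(image=image, mean=mean, std=std)
        out.append(image)
    return out
```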
@@ -489,19 +503,15 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_flip = do_flip if do_flip is not None else self.do_flip flip_prob = flip_prob if flip_prob is not None else self.flip_prob scale = scale if scale is not None else self.scale - do_rand_resize_crop = (do_rand_resize_crop - if do_rand_resize_crop is not None else - self.do_rand_resize_crop) + do_rand_resize_crop = do_rand_resize_crop if do_rand_resize_crop is not None else self.do_rand_resize_crop size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -513,54 +523,34 @@ def preprocess( images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") if do_flip and flip_prob is None: - raise ValueError( - "Flip probability must be specified if do_flip is True.") + raise ValueError("Flip probability must be specified if do_flip is True.") if do_rand_resize_crop and scale is None: - raise ValueError( - "Random resize crop probability must be specified if do_rand_resize_crop is True." 
- ) + raise ValueError("Random resize crop probability must be specified if do_rand_resize_crop is True.") if do_rand_resize_crop and mode == "train": images = [ - self.random_resized_crop( - image=image, size=size, scale=scale, resample=resample) - for image in images + self.random_resized_crop(image=image, size=size, scale=scale, resample=resample) for image in images ] elif do_resize and mode != "train": - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_flip and mode == "train": - images = [ - self.random_horizontal_flip( - image=image, flip_prob=flip_prob) for image in images - ] + images = [self.random_horizontal_flip(image=image, flip_prob=flip_prob) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: images = [convert_to_rgb(image) for image in images] images = [np.array(image, "float32") for image in images] @@ -571,24 +561,25 @@ def preprocess( batch_images["image"] / 255.0, mean=image_mean, std=image_std, - data_format="CHW", ) + data_format="CHW", + ) return {"image": image} - def preprocess_fixed( - self, images: ImageInput, - size: Optional[Dict[str, int]]=None) -> PIL.Image.Image: + def preprocess_fixed(self, images: ImageInput, size: Optional[Dict[str, int]] = None) -> PIL.Image.Image: size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) - processor = paddle.vision.transforms.Compose([ - paddle.vision.transforms.RandomResizedCrop( - [224, 224], scale=(1.0, 1.0), interpolation="bicubic"), - _convert_to_rgb, - paddle.vision.transforms.ToTensor(), - paddle.vision.transforms.Normalize( - mean=[0.48145466, 0.4578275, 0.40821073], - std=[0.26862954, 0.26130258, 0.27577711], ), - ]) + processor = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.RandomResizedCrop([224, 224], scale=(1.0, 1.0), interpolation="bicubic"), + _convert_to_rgb, + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize( + mean=[0.48145466, 0.4578275, 0.40821073], + std=[0.26862954, 0.26130258, 0.27577711], + ), + ] + ) inputs = [] for inp in images: inputs.append(processor(inp).unsqueeze(0)) @@ -618,8 +609,7 @@ def forward(self, img): scale = self.max_size / float(max(height, width)) if scale != 1.0: new_size = tuple(round(dim * scale) for dim in (height, width)) - img = paddle.vision.transforms.resize(img, new_size, - self.interpolation) + img = paddle.vision.transforms.resize(img, new_size, self.interpolation) pad_h = self.max_size - new_size[0] pad_w = self.max_size - new_size[1] img = paddle.vision.transforms.pad( @@ -630,41 +620,42 @@ def forward(self, img): pad_w - pad_w // 2, pad_h - pad_h // 2, ], - fill=self.fill, ) + fill=self.fill, + ) return img def image_transform( - image_size: int, - is_train: bool, - mean: Optional[Tuple[float, ...]]=(0.48145466, 0.4578275, 0.40821073), - std: Optional[Tuple[float, ...]]=(0.26862954, 0.26130258, 0.27577711), - resize_longest_max: bool=False, - fill_color: int=0, ): + image_size: int, + is_train: bool, + mean: Optional[Tuple[float, ...]] = (0.48145466, 0.4578275, 0.40821073), + std: Optional[Tuple[float, ...]] = (0.26862954, 0.26130258, 0.27577711), + resize_longest_max: bool = False, + fill_color: int = 0, +): if not isinstance(mean, (list, tuple)): - mean = (mean, ) * 3 
+ mean = (mean,) * 3 if not isinstance(std, (list, tuple)): - std = (std, ) * 3 + std = (std,) * 3 if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: image_size = image_size[0] normalize = paddle.vision.transforms.Normalize(mean=mean, std=std) if is_train: - return paddle.vision.transforms.Compose([ - paddle.vision.transforms.RandomResizedCrop( - image_size, scale=(1.0, 1.0), interpolation="bicubic"), - _convert_to_rgb, - paddle.vision.transforms.ToTensor(), - normalize, - ]) + return paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.RandomResizedCrop(image_size, scale=(1.0, 1.0), interpolation="bicubic"), + _convert_to_rgb, + paddle.vision.transforms.ToTensor(), + normalize, + ] + ) else: if resize_longest_max: transforms = [ResizeMaxSize(image_size, fill=fill_color)] else: transforms = [ - paddle.vision.transforms.Resize( - image_size, interpolation="bicubic"), + paddle.vision.transforms.Resize(image_size, interpolation="bicubic"), paddle.vision.transforms.CenterCrop(image_size), ] - transforms.extend( - [_convert_to_rgb, paddle.vision.transforms.ToTensor(), normalize]) + transforms.extend([_convert_to_rgb, paddle.vision.transforms.ToTensor(), normalize]) return paddle.vision.transforms.Compose(transforms) diff --git a/paddlemix/processors/groundingdino_processing.py b/paddlemix/processors/groundingdino_processing.py index 6e15c880bd9dc..7d1d4b48fa674 100644 --- a/paddlemix/processors/groundingdino_processing.py +++ b/paddlemix/processors/groundingdino_processing.py @@ -15,20 +15,14 @@ Processor class for GroundingDino. """ -import re -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union -import numpy as np import paddle import paddle.vision.transforms as T -import PIL from paddlenlp.taskflow.utils import pad_batch_data -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) from .base_processing import ProcessorMixin -from .image_utils import (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, - valid_images) +from .image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, valid_images from .processing_utils import BaseImageProcessor, BaseTextProcessor from .utils import _max_by_axis @@ -50,18 +44,18 @@ def __init__(self, image_processor, text_processor, tokenizer): super().__init__(image_processor, text_processor, tokenizer) def __call__( - self, - images=None, - text: str=None, - **kwargs, ): + self, + images=None, + text: str = None, + **kwargs, + ): if images is None or text is None: raise ValueError("You have to specify either images and text.") self.prompt = self.text_processor.pre_caption(text) input_ids = self.tokenizer([self.prompt]).input_ids - specical_tokens = self.tokenizer.convert_tokens_to_ids( - ["[CLS]", "[SEP]", ".", "?"]) + specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) tokenized_out = self.text_processor(input_ids, specical_tokens) image_tensor, mask = self.image_processor(images) @@ -86,8 +80,7 @@ def decode(self, posmap): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class GroudingDinoTextProcessor(BaseTextProcessor): @@ -96,19 +89,21 @@ class GroudingDinoTextProcessor(BaseTextProcessor): """ def __init__( - self, - 
max_words: int=256, - **kwargs, ): + self, + max_words: int = 256, + **kwargs, + ): super().__init__(**kwargs) self.max_words = max_words self.caption = None def __call__( - self, - input_ids, - special_tokens_list, - **kwargs, ): + self, + input_ids, + special_tokens_list, + **kwargs, + ): """ Preprocess the text with tokenization. """ @@ -116,25 +111,19 @@ def __call__( input_ids = pad_batch_data(input_ids) input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64).squeeze(-1) tokenized_out["input_ids"] = input_ids - tokenized_out["attention_mask"] = paddle.cast(input_ids != 0, - paddle.int64) + tokenized_out["attention_mask"] = paddle.cast(input_ids != 0, paddle.int64) ( text_self_attention_masks, position_ids, cate_to_token_mask_list, - ) = self.generate_masks_with_special_tokens_and_transfer_map( - tokenized_out, special_tokens_list) + ) = self.generate_masks_with_special_tokens_and_transfer_map(tokenized_out, special_tokens_list) if text_self_attention_masks.shape[1] > self.max_words: - text_self_attention_masks = text_self_attention_masks[:, :self. - max_words, : - self.max_words] - position_ids = position_ids[:, :self.max_words] - tokenized_out["input_ids"] = tokenized_out[ - "input_ids"][:, :self.max_words] - tokenized_out["attention_mask"] = tokenized_out[ - "attention_mask"][:, :self.max_words] + text_self_attention_masks = text_self_attention_masks[:, : self.max_words, : self.max_words] + position_ids = position_ids[:, : self.max_words] + tokenized_out["input_ids"] = tokenized_out["input_ids"][:, : self.max_words] + tokenized_out["attention_mask"] = tokenized_out["attention_mask"][:, : self.max_words] tokenized_out["position_ids"] = position_ids tokenized_out["text_self_attention_masks"] = text_self_attention_masks @@ -150,8 +139,7 @@ def pre_caption(self, caption: str) -> str: self.caption = caption return caption - def generate_masks_with_special_tokens_and_transfer_map( - self, tokenized, special_tokens_list): + def generate_masks_with_special_tokens_and_transfer_map(self, tokenized, special_tokens_list): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. 
Shape: [bs, num_token] @@ -170,8 +158,7 @@ def generate_masks_with_special_tokens_and_transfer_map( idxs = paddle.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = (paddle.eye(num_token, dtype=paddle.int32) - .cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1])) + attention_mask = paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) position_ids = paddle.zeros((bs, num_token), dtype=paddle.int64) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -182,17 +169,18 @@ def generate_masks_with_special_tokens_and_transfer_map( attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1:col + 1, previous_col + 1: - col + 1] = True - position_ids[row, previous_col + 1:col + 1] = paddle.arange( - 0, col - previous_col) - c2t_maski = paddle.zeros([num_token, ]).cast(paddle.bool) - c2t_maski[previous_col + 1:col] = True + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros( + [ + num_token, + ] + ).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True cate_to_token_mask_list[row].append(c2t_maski) previous_col = col - return attention_mask, position_ids.cast( - paddle.int64), cate_to_token_mask_list + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list class GroudingDinoImageProcessor(BaseImageProcessor): @@ -203,22 +191,22 @@ class GroudingDinoImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: List[int]=None, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_nested: bool=True, - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: List[int] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = True, + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else 800 self.do_resize = do_resize self.size = size self.do_normalize = do_normalize - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_nested = do_nested @@ -229,8 +217,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None): min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: - size = int( - round(max_size * min_original_size / max_original_size)) + size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (h, w) @@ -256,16 +243,13 @@ def get_size(image_size, size, max_size=None): if target is None: return rescaled_image - ratios = tuple( - float(s) / float(s_orig) - for s, s_orig in zip(rescaled_image.size, image.size)) + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) ratio_width, ratio_height = ratios target = target.copy() if "boxes" in target: boxes = target["boxes"] - scaled_boxes = boxes * paddle.to_tensor( - [ratio_width, ratio_height, ratio_width, ratio_height]) + scaled_boxes = 
boxes * paddle.to_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) target["boxes"] = scaled_boxes if "area" in target: @@ -276,11 +260,10 @@ def get_size(image_size, size, max_size=None): h, w = size target["size"] = paddle.to_tensor([h, w]) - if "masks" in target: - target["masks"] = (interpolate( - target["masks"][:, None].cast(paddle.float32), - size, - mode="nearest")[:, 0] > 0.5) + # if "masks" in target: + # target["masks"] = ( + # interpolate(target["masks"][:, None].cast(paddle.float32), size, mode="nearest")[:, 0] > 0.5 + # ) return rescaled_image, target @@ -298,22 +281,23 @@ def nested_tensor_from_tensor_list(self, tensor_list: List[paddle.Tensor]): mask = paddle.ones((b, h, w), dtype=paddle.bool) for i in range(b): img = tensor_list[i] - tensor[i, :img.shape[0], :img.shape[1], :img.shape[2]] = img - mask[i, :img.shape[1], :img.shape[2]] = False + tensor[i, : img.shape[0], : img.shape[1], : img.shape[2]] = img + mask[i, : img.shape[1], : img.shape[2]] = False else: raise ValueError("not supported") return tensor, mask def preprocess( - self, - images, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_nested: bool=None, - **kwargs, ): + self, + images, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = None, + **kwargs, + ): """ Preprocess an image or batch of images. @@ -329,23 +313,17 @@ def preprocess( if not isinstance(images, (list, tuple)): images = [images] - if isinstance(images[0], str): - images = [load_image(image) for image in images] + # if isinstance(images[0], str): + # images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") if do_resize: - images = [ - T.to_tensor(self.resize( - image=image, size=size)) for image in images - ] + images = [T.to_tensor(self.resize(image=image, size=size)) for image in images] if do_normalize: images = T.normalize(images, mean=image_mean, std=image_std) diff --git a/paddlemix/processors/image_processing_utils.py b/paddlemix/processors/image_processing_utils.py index 4cb4fb5343c27..0c8a0913dd8cd 100644 --- a/paddlemix/processors/image_processing_utils.py +++ b/paddlemix/processors/image_processing_utils.py @@ -19,16 +19,25 @@ from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_download, - hf_hub_url, repo_type_and_id_from_hf_id, - upload_folder) +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_download, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) from huggingface_hub.utils import EntryNotFoundError from paddlenlp import __version__ -from paddlenlp.transformers.feature_extraction_utils import \ - BatchFeature as BaseBatchFeature +from paddlenlp.transformers.feature_extraction_utils import ( + BatchFeature as BaseBatchFeature, +) from paddlemix.utils.downloader import ( - COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock, resolve_cache_dir) + COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir, +) from paddlemix.utils.log import logger IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json" @@ -75,9 +84,7 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" Instantiate a type of [`~processing_utils.ImageProcessingMixin`] from an image processor. @@ -155,13 +162,11 @@ def from_pretrained(cls, assert image_processor.do_normalize is False assert unused_kwargs == {"foo": False} ```""" - image_processor_dict, kwargs = cls.get_image_processor_dict( - pretrained_model_name_or_path, **kwargs) + image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(image_processor_dict, **kwargs) - def save_pretrained(self, save_directory: Union[str, os.PathLike], - **kwargs): + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): """ Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the [`~processing_utils.ImageProcessingMixin.from_pretrained`] class method. @@ -173,15 +178,12 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
""" if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` - output_image_processor_file = os.path.join(save_directory, - IMAGE_PROCESSOR_NAME) + output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) self.to_json_file(output_image_processor_file) logger.info(f"Image processor saved in {output_image_processor_file}") @@ -189,13 +191,14 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], return [output_image_processor_file] def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool]=None, - subfolder: Optional[str]=None, - commit_message: Optional[str]=None, - revision: Optional[str]=None, - create_pr: bool=False, ): + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): """ Uploads all elements of this processor to a new HuggingFace Hub repository. Args: @@ -220,9 +223,7 @@ def save_to_hf_hub( # Check if README file already exist in repo try: - get_hf_file_metadata( - hf_hub_url( - repo_id=repo_id, filename="README.md", revision=revision)) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) has_readme = True except EntryNotFoundError: has_readme = False @@ -248,12 +249,13 @@ def save_to_hf_hub( folder_path=root_dir, commit_message=commit_message, revision=revision, - create_pr=create_pr, ) + create_pr=create_pr, + ) @classmethod def get_image_processor_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`. 
@@ -271,14 +273,12 @@ def get_image_processor_dict( cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) subfolder = kwargs.pop("subfolder", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, - from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): - resolved_image_processor_file = os.path.join( - pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) + resolved_image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) elif os.path.isfile(pretrained_model_name_or_path): resolved_image_processor_file = pretrained_model_name_or_path is_local = True @@ -290,18 +290,20 @@ def get_image_processor_dict( cache_dir=cache_dir, subfolder=subfolder, library_name="PaddleNLP", - library_version=__version__, ) + library_version=__version__, + ) else: # Assuming from community-contributed pretrained models - image_processor_file = "/".join([ - COMMUNITY_MODEL_PREFIX, - pretrained_model_name_or_path, - IMAGE_PROCESSOR_NAME, - ]) + image_processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + IMAGE_PROCESSOR_NAME, + ] + ) try: # Load from local folder or from cache or download from model Hub and cache - resolved_image_processor_file = get_path_from_url_with_filelock( - image_processor_file, cache_dir) + resolved_image_processor_file = get_path_from_url_with_filelock(image_processor_file, cache_dir) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to # the original exception. @@ -312,13 +314,12 @@ def get_image_processor_dict( f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load" " it from 'BOS', make sure you don't have a local directory with the" f" same name. 
Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" - f" directory containing a {IMAGE_PROCESSOR_NAME} file") + f" directory containing a {IMAGE_PROCESSOR_NAME} file" + ) try: # Load image_processor dict - with open( - resolved_image_processor_file, "r", - encoding="utf-8") as reader: + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: text = reader.read() image_processor_dict = json.loads(text) @@ -328,8 +329,7 @@ def get_image_processor_dict( ) if is_local: - logger.info( - f"loading configuration file {resolved_image_processor_file}") + logger.info(f"loading configuration file {resolved_image_processor_file}") else: logger.info( f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}" @@ -449,14 +449,14 @@ def __call__(self, images, **kwargs) -> BatchFeature: return self.preprocess(images, **kwargs) def preprocess(self, images, **kwargs) -> BatchFeature: - raise NotImplementedError( - "Each image processor must implement its own preprocess method") + raise NotImplementedError("Each image processor must implement its own preprocess method") VALID_SIZE_DICT_KEYS = ( {"height", "width"}, {"shortest_edge"}, - {"shortest_edge", "longest_edge"}, ) + {"shortest_edge", "longest_edge"}, +) def is_valid_size_dict(size_dict): @@ -471,16 +471,15 @@ def is_valid_size_dict(size_dict): def convert_to_size_dict( - size, - max_size: Optional[int]=None, - default_to_square: bool=True, - height_width_order: bool=True, ): + size, + max_size: Optional[int] = None, + default_to_square: bool = True, + height_width_order: bool = True, +): # By default, if size is an int we assume it represents a tuple of (size, size). if isinstance(size, int) and default_to_square: if max_size is not None: - raise ValueError( - "Cannot specify both size as an int, with default_to_square=True and max_size" - ) + raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size") return {"height": size, "width": size} # In other configs, if size is an int and default_to_square is False, size represents the length of # the shortest edge after resizing. @@ -499,11 +498,12 @@ def convert_to_size_dict( def get_size_dict( - size: Union[int, Iterable[int], Dict[str, int]]=None, - max_size: Optional[int]=None, - height_width_order: bool=True, - default_to_square: bool=True, - param_name="size", ) -> dict: + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: """ Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, @@ -526,11 +526,11 @@ def get_size_dict( If `size` is an int, whether to default to a square image or not. """ if not isinstance(size, dict): - size_dict = convert_to_size_dict(size, max_size, default_to_square, - height_width_order) + size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order) logger.info( f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." 
- f" Converted to {size_dict}.", ) + f" Converted to {size_dict}.", + ) else: size_dict = size diff --git a/paddlemix/processors/image_transform_utils.py b/paddlemix/processors/image_transform_utils.py index b11ca848774a0..4d3e9918931b3 100644 --- a/paddlemix/processors/image_transform_utils.py +++ b/paddlemix/processors/image_transform_utils.py @@ -23,10 +23,16 @@ from paddle.vision.transforms import functional as F from PIL import Image -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - TensorType, get_channel_dimension_axis, - get_image_size, infer_channel_dimension_format, - to_numpy_array) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + TensorType, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) from .utils import ExplicitEnum @@ -35,9 +41,9 @@ def is_paddle_tensor(tensor): def to_channel_dimension_format( - image: np.ndarray, - channel_dim: Union[ChannelDimension, str], - input_channel_dim: Optional[Union[ChannelDimension, str]]=None, + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, ) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -52,8 +58,7 @@ def to_channel_dimension_format( `np.ndarray`: The image with the channel dimension set to `channel_dim`. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if input_channel_dim is None: input_channel_dim = infer_channel_dimension_format(image) @@ -67,17 +72,17 @@ def to_channel_dimension_format( elif target_channel_dim == ChannelDimension.LAST: image = image.transpose((1, 2, 0)) else: - raise ValueError("Unsupported channel dimension format: {}".format( - channel_dim)) + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) return image def rescale( - image: np.ndarray, - scale: float, - data_format: Optional[ChannelDimension]=None, - dtype=np.float32, ) -> np.ndarray: + image: np.ndarray, + scale: float, + data_format: Optional[ChannelDimension] = None, + dtype=np.float32, +) -> np.ndarray: """ Rescales `image` by `scale`. @@ -96,20 +101,19 @@ def rescale( `np.ndarray`: The rescaled image. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: - rescaled_image = to_channel_dimension_format(rescaled_image, - data_format) + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) rescaled_image = rescaled_image.astype(dtype) return rescaled_image def to_pil_image( - image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], - do_rescale: Optional[bool]=None, ) -> "PIL.Image.Image": + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. 
@@ -131,8 +135,7 @@ def to_pil_image( if is_paddle_tensor(image): image = image.numpy() elif not isinstance(image, np.ndarray): - raise ValueError("Input image type not supported: {}".format( - type(image))) + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. image = to_channel_dimension_format(image, ChannelDimension.LAST) @@ -141,8 +144,7 @@ def to_pil_image( image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - do_rescale = (isinstance(image.flat[0], (float, np.float32, np.float64)) - if do_rescale is None else do_rescale) + do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale if do_rescale: image = rescale(image, 255) image = image.astype(np.uint8) @@ -151,10 +153,11 @@ def to_pil_image( # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int], Tuple[int]], - default_to_square: bool=True, - max_size: Optional[int]=None, ) -> tuple: + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: """ Find the target (height, width) dimension of the output image after resizing given the input image and the desired size. @@ -190,8 +193,7 @@ def get_resize_output_image_size( # Perform same logic as if size was an int size = size[0] else: - raise ValueError( - "size must have 1 or 2 elements if it is a list or tuple") + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") if default_to_square: return (size, size) @@ -200,14 +202,14 @@ def get_resize_output_image_size( short, long = (width, height) if width <= height else (height, width) requested_new_short = size - new_short, new_long = requested_new_short, int(requested_new_short * long / - short) + new_short, new_long = requested_new_short, int(requested_new_short * long / short) if max_size is not None: if max_size <= requested_new_short: raise ValueError( f"max_size = {max_size} must be strictly greater than the requested " - f"size for the smaller edge size = {size}") + f"size for the smaller edge size = {size}" + ) if new_long > max_size: new_short, new_long = int(max_size * new_short / new_long), max_size @@ -215,12 +217,13 @@ def get_resize_output_image_size( def resize( - image, - size: Tuple[int, int], - resample: "PILImageResampling" =None, - reducing_gap: Optional[int]=None, - data_format: Optional[ChannelDimension]=None, - return_numpy: bool=True, ) -> np.ndarray: + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: """ Resizes `image` to `(height, width)` specified by `size` using the PIL library. @@ -250,8 +253,7 @@ def resize( # For all transformations, we want to keep the same data format as the input image unless otherwise specified. # The resized image from PIL will always have channels last, so find the input format first. 
- data_format = (infer_channel_dimension_format(image) - if data_format is None else data_format) + data_format = infer_channel_dimension_format(image) if data_format is None else data_format # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy @@ -259,27 +261,26 @@ def resize( image = to_pil_image(image) height, width = size # PIL images are in the format (width, height) - resized_image = image.resize( - (width, height), resample=resample, reducing_gap=reducing_gap) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) if return_numpy: resized_image = np.array(resized_image) # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image # so we need to add it back if necessary. - resized_image = (np.expand_dims( - resized_image, axis=-1) - if resized_image.ndim == 2 else resized_image) + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image # The image is always in channels last format after converting from a PIL image resized_image = to_channel_dimension_format( - resized_image, data_format, input_channel_dim=ChannelDimension.LAST) + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) return resized_image def normalize( - image: np.ndarray, - mean: Union[float, Iterable[float]], - std: Union[float, Iterable[float]], - data_format: Optional[ChannelDimension]=None, ) -> np.ndarray: + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: """ Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. @@ -298,7 +299,8 @@ def normalize( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - # casting to numpy array and dividing by 255. 
image = to_numpy_array(image) @@ -313,18 +315,14 @@ def normalize( if isinstance(mean, Iterable): if len(mean) != num_channels: - raise ValueError( - f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}" - ) + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") else: mean = [mean] * num_channels mean = np.array(mean, dtype=image.dtype) if isinstance(std, Iterable): if len(std) != num_channels: - raise ValueError( - f"std must have {num_channels} elements if it is an iterable, got {len(std)}" - ) + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") else: std = [std] * num_channels std = np.array(std, dtype=image.dtype) @@ -334,16 +332,16 @@ def normalize( else: image = ((image.T - mean) / std).T - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image def center_crop( - image: np.ndarray, - size: Tuple[int, int], - data_format: Optional[Union[str, ChannelDimension]]=None, - return_numpy: Optional[bool]=None, ) -> np.ndarray: + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: """ Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to the size given, it will be padded (so the returned result will always be of size `size`). @@ -370,20 +368,18 @@ def center_crop( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) image = to_numpy_array(image) return_numpy = False if return_numpy is None else return_numpy else: return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if not isinstance(size, Iterable) or len(size) != 2: - raise ValueError( - "size must have 2 elements representing the height and width of the output image" - ) + raise ValueError("size must have 2 elements representing the height and width of the output image") input_data_format = infer_channel_dimension_format(image) output_data_format = data_format if data_format is not None else input_data_format @@ -426,8 +422,7 @@ def center_crop( left += left_pad right += left_pad - new_image = new_image[..., max(0, top):min(new_height, bottom), max( - 0, left):min(new_width, right)] + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] new_image = to_channel_dimension_format(new_image, output_data_format) if not return_numpy: @@ -436,8 +431,7 @@ def center_crop( return new_image -def _center_to_corners_format_paddle( - bboxes_center: "paddle.Tensor") -> "paddle.Tensor": +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": center_x, center_y, width, height = bboxes_center.unbind(-1) bbox_corners = paddle.stack( # top left x, top left y, bottom right x, bottom right y @@ -447,7 +441,8 @@ def _center_to_corners_format_paddle( (center_x + 0.5 * width), (center_y + 0.5 * height), ], - axis=-1, ) + axis=-1, + ) return bbox_corners @@ -461,7 +456,8 @@ def 
_center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: center_x + 0.5 * width, center_y + 0.5 * height, ], - axis=-1, ) + axis=-1, + ) return bboxes_corners @@ -486,9 +482,9 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType: def _corners_to_center_format_paddle( - bboxes_corners: "paddle.Tensor", ) -> "paddle.Tensor": - top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind( - -1) + bboxes_corners: "paddle.Tensor", +) -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) b = [ (top_left_x + bottom_right_x) / 2, # center x (top_left_y + bottom_right_y) / 2, # center y @@ -507,7 +503,8 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: (bottom_right_x - top_left_x), # width (bottom_right_y - top_left_y), # height ], - axis=-1, ) + axis=-1, + ) return bboxes_center @@ -539,8 +536,7 @@ def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, - 2] + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) @@ -575,12 +571,12 @@ class PaddingMode(ExplicitEnum): def pad( - image: np.ndarray, - padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], - mode: PaddingMode=PaddingMode.CONSTANT, - constant_values: Union[float, Iterable[float]]=0.0, - data_format: Optional[Union[str, ChannelDimension]]=None, - input_data_format: Optional[Union[str, ChannelDimension]]=None, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Pads the `image` with the specified (height, width) `padding` and `mode`. 
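The `_center_to_corners_format_*` and `_corners_to_center_format_*` pairs above implement the usual (cx, cy, w, h) to (x0, y0, x1, y1) box conversions and back. A quick numpy check of the round trip, mirroring the stacked expressions in the hunks:

```python
import numpy as np

centers = np.array([[10.0, 20.0, 4.0, 6.0]])  # (cx, cy, w, h)

cx, cy, w, h = centers[..., 0], centers[..., 1], centers[..., 2], centers[..., 3]
corners = np.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], axis=-1)

x0, y0, x1, y1 = corners[..., 0], corners[..., 1], corners[..., 2], corners[..., 3]
recovered = np.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], axis=-1)

assert np.allclose(recovered, centers)
print(corners)  # [[ 8. 17. 12. 23.]]
```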
@@ -628,19 +624,15 @@ def _expand_for_data_format(values): values = ((values, values), (values, values)) elif isinstance(values, tuple) and len(values) == 1: values = ((values[0], values[0]), (values[0], values[0])) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], int)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): values = (values, values) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], tuple)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): values = values else: raise ValueError(f"Unsupported format: {values}") # add 0 for channel dimension - values = (((0, 0), *values) - if input_data_format == ChannelDimension.FIRST else (*values, - (0, 0))) + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) # Add additional padding if there's a batch dimension values = (0, *values) if image.ndim == 4 else values @@ -650,10 +642,7 @@ def _expand_for_data_format(values): if mode == PaddingMode.CONSTANT: constant_values = _expand_for_data_format(constant_values) - image = np.pad(image, - padding, - mode="constant", - constant_values=constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) elif mode == PaddingMode.REFLECT: image = np.pad(image, padding, mode="reflect") elif mode == PaddingMode.REPLICATE: @@ -663,8 +652,7 @@ def _expand_for_data_format(values): else: raise ValueError(f"Invalid padding mode: {mode}") - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image @@ -697,8 +685,9 @@ def decode_image(image_path: str) -> ImageInput: def random_horizontal_flip( - image: np.ndarray, - flip_prob: float, ) -> np.ndarray: + image: np.ndarray, + flip_prob: float, +) -> np.ndarray: """ Randomly flips the image horizontally. @@ -757,19 +746,18 @@ def _get_image_size(img): elif len(img.shape) == 4: return img.shape[2:][::-1] # nchw -> wh else: - raise ValueError( - "The dim for input Tensor should be 3-D or 4-D, but received {}". - format(len(img.shape))) + raise ValueError("The dim for input Tensor should be 3-D or 4-D, but received {}".format(len(img.shape))) else: raise TypeError(f"Unexpected type {type(img)}") def random_resized_crop( - image: np.ndarray, - size: Union[int, List, Tuple], - scale: float=(0.08, 1.0), - ratio: float=(3.0 / 4, 4.0 / 3), - resample: "PILImageResampling" =None, ) -> np.ndarray: + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float = (0.08, 1.0), + ratio: float = (3.0 / 4, 4.0 / 3), + resample: "PILImageResampling" = None, +) -> np.ndarray: """ Crop the input data to random size and aspect ratio. 
A crop of random size (default: of 0.08 to 1.0) of the original size and a random diff --git a/paddlemix/processors/image_transforms.py b/paddlemix/processors/image_transforms.py index f8e07441533b5..135d42e0ac095 100644 --- a/paddlemix/processors/image_transforms.py +++ b/paddlemix/processors/image_transforms.py @@ -20,12 +20,17 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers.tokenizer_utils_base import (ExplicitEnum, - TensorType) +from paddlenlp.transformers.tokenizer_utils_base import ExplicitEnum, TensorType -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - get_channel_dimension_axis, get_image_size, - infer_channel_dimension_format, to_numpy_array) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) def is_paddle_tensor(tensor): @@ -33,9 +38,9 @@ def is_paddle_tensor(tensor): def to_channel_dimension_format( - image: np.ndarray, - channel_dim: Union[ChannelDimension, str], - input_channel_dim: Optional[Union[ChannelDimension, str]]=None, + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, ) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -50,8 +55,7 @@ def to_channel_dimension_format( `np.ndarray`: The image with the channel dimension set to `channel_dim`. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if input_channel_dim is None: input_channel_dim = infer_channel_dimension_format(image) @@ -65,17 +69,17 @@ def to_channel_dimension_format( elif target_channel_dim == ChannelDimension.LAST: image = image.transpose((1, 2, 0)) else: - raise ValueError("Unsupported channel dimension format: {}".format( - channel_dim)) + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) return image def rescale( - image: np.ndarray, - scale: float, - data_format: Optional[ChannelDimension]=None, - dtype=np.float32, ) -> np.ndarray: + image: np.ndarray, + scale: float, + data_format: Optional[ChannelDimension] = None, + dtype=np.float32, +) -> np.ndarray: """ Rescales `image` by `scale`. @@ -94,20 +98,19 @@ def rescale( `np.ndarray`: The rescaled image. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: - rescaled_image = to_channel_dimension_format(rescaled_image, - data_format) + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) rescaled_image = rescaled_image.astype(dtype) return rescaled_image def to_pil_image( - image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], - do_rescale: Optional[bool]=None, ) -> "PIL.Image.Image": + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. 
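For the shortest-edge branch of `get_resize_output_image_size` above (an int `size` with `default_to_square=False`), the short side is scaled to `size`, the long side keeps the aspect ratio, and `max_size` optionally caps the long side. A standalone sketch of that rule:

```python
def shortest_edge_resize(height, width, size, max_size=None):
    """Return (new_height, new_width); mirrors the shortest-edge logic above."""
    short, long = (width, height) if width <= height else (height, width)
    new_short, new_long = size, int(size * long / short)
    if max_size is not None and new_long > max_size:
        new_short, new_long = int(max_size * new_short / new_long), max_size
    return (new_long, new_short) if width <= height else (new_short, new_long)


print(shortest_edge_resize(480, 640, size=224))                # (224, 298)
print(shortest_edge_resize(480, 640, size=224, max_size=256))  # (192, 256)
```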
@@ -129,8 +132,7 @@ def to_pil_image( if is_paddle_tensor(image): image = image.numpy() elif not isinstance(image, np.ndarray): - raise ValueError("Input image type not supported: {}".format( - type(image))) + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. image = to_channel_dimension_format(image, ChannelDimension.LAST) @@ -139,8 +141,7 @@ def to_pil_image( image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - do_rescale = (isinstance(image.flat[0], (float, np.float32, np.float64)) - if do_rescale is None else do_rescale) + do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale if do_rescale: image = rescale(image, 255) image = image.astype(np.uint8) @@ -149,10 +150,11 @@ def to_pil_image( # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int], Tuple[int]], - default_to_square: bool=True, - max_size: Optional[int]=None, ) -> tuple: + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: """ Find the target (height, width) dimension of the output image after resizing given the input image and the desired size. @@ -188,8 +190,7 @@ def get_resize_output_image_size( # Perform same logic as if size was an int size = size[0] else: - raise ValueError( - "size must have 1 or 2 elements if it is a list or tuple") + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") if default_to_square: return (size, size) @@ -198,14 +199,14 @@ def get_resize_output_image_size( short, long = (width, height) if width <= height else (height, width) requested_new_short = size - new_short, new_long = requested_new_short, int(requested_new_short * long / - short) + new_short, new_long = requested_new_short, int(requested_new_short * long / short) if max_size is not None: if max_size <= requested_new_short: raise ValueError( f"max_size = {max_size} must be strictly greater than the requested " - f"size for the smaller edge size = {size}") + f"size for the smaller edge size = {size}" + ) if new_long > max_size: new_short, new_long = int(max_size * new_short / new_long), max_size @@ -213,12 +214,13 @@ def get_resize_output_image_size( def resize( - image, - size: Tuple[int, int], - resample: "PILImageResampling" =None, - reducing_gap: Optional[int]=None, - data_format: Optional[ChannelDimension]=None, - return_numpy: bool=True, ) -> np.ndarray: + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: """ Resizes `image` to `(height, width)` specified by `size` using the PIL library. @@ -248,8 +250,7 @@ def resize( # For all transformations, we want to keep the same data format as the input image unless otherwise specified. # The resized image from PIL will always have channels last, so find the input format first. 
- data_format = (infer_channel_dimension_format(image) - if data_format is None else data_format) + data_format = infer_channel_dimension_format(image) if data_format is None else data_format # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy @@ -257,27 +258,26 @@ def resize( image = to_pil_image(image) height, width = size # PIL images are in the format (width, height) - resized_image = image.resize( - (width, height), resample=resample, reducing_gap=reducing_gap) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) if return_numpy: resized_image = np.array(resized_image) # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image # so we need to add it back if necessary. - resized_image = (np.expand_dims( - resized_image, axis=-1) - if resized_image.ndim == 2 else resized_image) + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image # The image is always in channels last format after converting from a PIL image resized_image = to_channel_dimension_format( - resized_image, data_format, input_channel_dim=ChannelDimension.LAST) + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) return resized_image def normalize( - image: np.ndarray, - mean: Union[float, Iterable[float]], - std: Union[float, Iterable[float]], - data_format: Optional[ChannelDimension]=None, ) -> np.ndarray: + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: """ Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. @@ -296,7 +296,8 @@ def normalize( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - # casting to numpy array and dividing by 255. 
image = to_numpy_array(image) @@ -311,18 +312,14 @@ def normalize( if isinstance(mean, Iterable): if len(mean) != num_channels: - raise ValueError( - f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}" - ) + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") else: mean = [mean] * num_channels mean = np.array(mean, dtype=image.dtype) if isinstance(std, Iterable): if len(std) != num_channels: - raise ValueError( - f"std must have {num_channels} elements if it is an iterable, got {len(std)}" - ) + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") else: std = [std] * num_channels std = np.array(std, dtype=image.dtype) @@ -332,16 +329,16 @@ def normalize( else: image = ((image.T - mean) / std).T - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image def center_crop( - image: np.ndarray, - size: Tuple[int, int], - data_format: Optional[Union[str, ChannelDimension]]=None, - return_numpy: Optional[bool]=None, ) -> np.ndarray: + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: """ Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to the size given, it will be padded (so the returned result will always be of size `size`). @@ -368,20 +365,18 @@ def center_crop( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) image = to_numpy_array(image) return_numpy = False if return_numpy is None else return_numpy else: return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if not isinstance(size, Iterable) or len(size) != 2: - raise ValueError( - "size must have 2 elements representing the height and width of the output image" - ) + raise ValueError("size must have 2 elements representing the height and width of the output image") input_data_format = infer_channel_dimension_format(image) output_data_format = data_format if data_format is not None else input_data_format @@ -424,8 +419,7 @@ def center_crop( left += left_pad right += left_pad - new_image = new_image[..., max(0, top):min(new_height, bottom), max( - 0, left):min(new_width, right)] + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] new_image = to_channel_dimension_format(new_image, output_data_format) if not return_numpy: @@ -434,8 +428,7 @@ def center_crop( return new_image -def _center_to_corners_format_paddle( - bboxes_center: "paddle.Tensor") -> "paddle.Tensor": +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": center_x, center_y, width, height = bboxes_center.unbind(-1) bbox_corners = paddle.stack( # top left x, top left y, bottom right x, bottom right y @@ -445,7 +438,8 @@ def _center_to_corners_format_paddle( (center_x + 0.5 * width), (center_y + 0.5 * height), ], - axis=-1, ) + axis=-1, + ) return bbox_corners @@ -459,7 +453,8 @@ def 
_center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: center_x + 0.5 * width, center_y + 0.5 * height, ], - axis=-1, ) + axis=-1, + ) return bboxes_corners @@ -484,9 +479,9 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType: def _corners_to_center_format_paddle( - bboxes_corners: "paddle.Tensor", ) -> "paddle.Tensor": - top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind( - -1) + bboxes_corners: "paddle.Tensor", +) -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) b = [ (top_left_x + bottom_right_x) / 2, # center x (top_left_y + bottom_right_y) / 2, # center y @@ -505,7 +500,8 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: (bottom_right_x - top_left_x), # width (bottom_right_y - top_left_y), # height ], - axis=-1, ) + axis=-1, + ) return bboxes_center @@ -537,8 +533,7 @@ def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, - 2] + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) @@ -573,12 +568,12 @@ class PaddingMode(ExplicitEnum): def pad( - image: np.ndarray, - padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], - mode: PaddingMode=PaddingMode.CONSTANT, - constant_values: Union[float, Iterable[float]]=0.0, - data_format: Optional[Union[str, ChannelDimension]]=None, - input_data_format: Optional[Union[str, ChannelDimension]]=None, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Pads the `image` with the specified (height, width) `padding` and `mode`. 
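The constant-mode branch of `pad` above expands the user-supplied padding into per-axis `(before, after)` pairs and prepends `(0, 0)` for the channel axis before delegating to `np.pad`. A minimal numpy equivalent for a channels-first image (sketch, with made-up padding values):

```python
import numpy as np

image = np.ones((3, 4, 4), dtype=np.float32)  # (C, H, W)

# (top, bottom) = (1, 1) rows and (left, right) = (2, 2) columns; channel axis untouched.
padding = ((0, 0), (1, 1), (2, 2))
padded = np.pad(image, padding, mode="constant", constant_values=0.0)

print(padded.shape)  # (3, 6, 8)
```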
@@ -626,19 +621,15 @@ def _expand_for_data_format(values): values = ((values, values), (values, values)) elif isinstance(values, tuple) and len(values) == 1: values = ((values[0], values[0]), (values[0], values[0])) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], int)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): values = (values, values) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], tuple)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): values = values else: raise ValueError(f"Unsupported format: {values}") # add 0 for channel dimension - values = (((0, 0), *values) - if input_data_format == ChannelDimension.FIRST else (*values, - (0, 0))) + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) # Add additional padding if there's a batch dimension values = (0, *values) if image.ndim == 4 else values @@ -648,10 +639,7 @@ def _expand_for_data_format(values): if mode == PaddingMode.CONSTANT: constant_values = _expand_for_data_format(constant_values) - image = np.pad(image, - padding, - mode="constant", - constant_values=constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) elif mode == PaddingMode.REFLECT: image = np.pad(image, padding, mode="reflect") elif mode == PaddingMode.REPLICATE: @@ -661,8 +649,7 @@ def _expand_for_data_format(values): else: raise ValueError(f"Invalid padding mode: {mode}") - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image diff --git a/paddlemix/processors/image_utils.py b/paddlemix/processors/image_utils.py index a9ac6fca3f62d..8f59dd153e395 100644 --- a/paddlemix/processors/image_utils.py +++ b/paddlemix/processors/image_utils.py @@ -49,15 +49,19 @@ def to_numpy(obj): return obj -if version.parse(version.parse(PIL.__version__).base_version) >= version.parse( - "9.1.0"): +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): PILImageResampling = PIL.Image.Resampling else: PILImageResampling = PIL.Image -ImageInput = Union["PIL.Image.Image", np.ndarray, "paddle.Tensor", - List["PIL.Image.Image"], List[np.ndarray], - List["paddle.Tensor"], ] # noqa +ImageInput = Union[ + "PIL.Image.Image", + np.ndarray, + "paddle.Tensor", + List["PIL.Image.Image"], + List[np.ndarray], + List["paddle.Tensor"], +] # noqa class TensorType(ExplicitEnum): @@ -76,8 +80,7 @@ class ChannelDimension(ExplicitEnum): def is_valid_image(img): - return (isinstance(img, PIL.Image.Image) or isinstance(img, np.ndarray) or - is_paddle_tensor(img)) + return isinstance(img, PIL.Image.Image) or isinstance(img, np.ndarray) or is_paddle_tensor(img) def valid_images(imgs): @@ -98,7 +101,7 @@ def is_batched(img): return False -def make_list_of_images(images, expected_ndims: int=3) -> List[ImageInput]: +def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: """ Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1. If the input is a batch of images, it is converted to a list of images. @@ -127,11 +130,12 @@ def make_list_of_images(images, expected_ndims: int=3) -> List[ImageInput]: else: raise ValueError( f"Invalid image shape. 
Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got" - f" {images.ndim} dimensions.") + f" {images.ndim} dimensions." + ) return images raise ValueError( - "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " - f"but got {type(images)}.") + "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " f"but got {type(images)}." + ) def to_numpy_array(img) -> np.ndarray: @@ -159,8 +163,7 @@ def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: elif image.ndim == 4: first_dim, last_dim = 1, 3 else: - raise ValueError( - f"Unsupported number of image dimensions: {image.ndim}") + raise ValueError(f"Unsupported number of image dimensions: {image.ndim}") if image.shape[first_dim] in (1, 3): return ChannelDimension.FIRST @@ -188,8 +191,7 @@ def get_channel_dimension_axis(image: np.ndarray) -> int: raise ValueError(f"Unsupported data format: {channel_dim}") -def get_image_size(image: np.ndarray, - channel_dim: ChannelDimension=None) -> Tuple[int, int]: +def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: """ Returns the (height, width) dimensions of the image. @@ -213,37 +215,44 @@ def get_image_size(image: np.ndarray, raise ValueError(f"Unsupported data format: {channel_dim}") -def is_valid_annotation_coco_detection( - annotation: Dict[str, Union[List, Tuple]]) -> bool: - if (isinstance(annotation, dict) and "image_id" in annotation and - "annotations" in annotation and - isinstance(annotation["annotations"], (list, tuple)) and ( - # an image can have no annotations - len(annotation["annotations"]) == 0 or - isinstance(annotation["annotations"][0], dict))): +def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "annotations" in annotation + and isinstance(annotation["annotations"], (list, tuple)) + and ( + # an image can have no annotations + len(annotation["annotations"]) == 0 + or isinstance(annotation["annotations"][0], dict) + ) + ): return True return False -def is_valid_annotation_coco_panoptic( - annotation: Dict[str, Union[List, Tuple]]) -> bool: - if (isinstance(annotation, dict) and "image_id" in annotation and - "segments_info" in annotation and "file_name" in annotation and - isinstance(annotation["segments_info"], (list, tuple)) and ( - # an image can have no segments - len(annotation["segments_info"]) == 0 or - isinstance(annotation["segments_info"][0], dict))): +def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "segments_info" in annotation + and "file_name" in annotation + and isinstance(annotation["segments_info"], (list, tuple)) + and ( + # an image can have no segments + len(annotation["segments_info"]) == 0 + or isinstance(annotation["segments_info"][0], dict) + ) + ): return True return False -def valid_coco_detection_annotations( - annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: +def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: return all(is_valid_annotation_coco_detection(ann) for ann in annotations) -def valid_coco_panoptic_annotations( - annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: +def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: return 
all(is_valid_annotation_coco_panoptic(ann) for ann in annotations) @@ -280,8 +289,7 @@ def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": return image -def get_preprocess_shape(oldh: int, oldw: int, - long_side_length: int) -> Tuple[int, int]: +def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: """ Compute the output size given input size and target long side length. """ diff --git a/paddlemix/processors/imagebind_processing.py b/paddlemix/processors/imagebind_processing.py index 1b70eed72629f..ff69dca3e927d 100644 --- a/paddlemix/processors/imagebind_processing.py +++ b/paddlemix/processors/imagebind_processing.py @@ -17,19 +17,14 @@ """ import logging -from typing import Dict, List, Optional, Union +from typing import List, Optional, Union -import numpy as np import paddle from paddle.vision.transforms import transforms as T -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding from paddlevideo.data.clip_sampling import ConstantClipsPerVideoSampler -from PIL import Image from .base_processing import ProcessorMixin -from .image_processing_utils import BatchFeature -from .image_utils import ImageInput from .processing_utils import BaseAudioProcessor __all__ = ["ImageBindProcessor", "ImageBindAudioProcessor"] @@ -48,34 +43,23 @@ class ImageBindProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer, audio_processor): super().__init__(image_processor, tokenizer, audio_processor) - def __call__(self, - text=None, - images=None, - audios=None, - return_tensors=None, - **kwargs): + def __call__(self, text=None, images=None, audios=None, return_tensors=None, **kwargs): if text is None and images is None: - raise ValueError( - "You have to specify either text or images. Both cannot be none." - ) + raise ValueError("You have to specify either text or images. 
Both cannot be none.") if text is not None: - encoding = self.tokenizer( - text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) n, m = encoding["input_ids"].shape - zero_encoding = paddle.zeros( - shape=[n, self.tokenizer.max_len], dtype="int64") + zero_encoding = paddle.zeros(shape=[n, self.tokenizer.max_len], dtype="int64") zero_encoding[:, :m] = paddle.to_tensor(data=encoding["input_ids"]) encoding["input_ids"] = zero_encoding if images is not None: - image_features = self.image_processor( - images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) if audios is not None: - encoding["audio_values"] = self.audio_processor( - audios, return_tensors=return_tensors, **kwargs) + encoding["audio_values"] = self.audio_processor(audios, return_tensors=return_tensors, **kwargs) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values @@ -83,30 +67,29 @@ def __call__(self, elif text is not None: return encoding else: - return BatchEncoding( - data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class ImageBindAudioProcessor(BaseAudioProcessor): model_input_names = ["audio_values"] def __init__( - self, - num_mel_bins: int=0, - target_length: int=0, - sample_rate: int=0, - clip_duration: int=0, - clips_per_video: int=0, - mean: Optional[Union[float, List[float]]]=None, - std: Optional[Union[float, List[float]]]=None, - **kwargs, ): + self, + num_mel_bins: int = 0, + target_length: int = 0, + sample_rate: int = 0, + clip_duration: int = 0, + clips_per_video: int = 0, + mean: Optional[Union[float, List[float]]] = None, + std: Optional[Union[float, List[float]]] = None, + **kwargs, + ): super().__init__(**kwargs) self.num_mel_bins = num_mel_bins self.target_length = target_length @@ -117,9 +100,10 @@ def __init__( self.std = std def preprocess( - self, - audio_path: Union[str, List[str]], - **kwargs, ): + self, + audio_path: Union[str, List[str]], + **kwargs, + ): """ Preprocess the text with tokenization. 
""" @@ -128,32 +112,38 @@ def preprocess( audio_outputs = [] # breakpoint() clip_sampler = ConstantClipsPerVideoSampler( - clip_duration=self.clip_duration, - clips_per_video=self.clips_per_video) + clip_duration=self.clip_duration, clips_per_video=self.clips_per_video + ) # for audio_path in audio_paths: waveform, sr = paddle.audio.load(audio_path) if self.sample_rate != sr: - waveform = paddle.audio.functional.resample( - waveform, orig_freq=sr, new_freq=self.sample_rate) + waveform = paddle.audio.functional.resample(waveform, orig_freq=sr, new_freq=self.sample_rate) - all_clips_timepoints = self.get_clip_timepoints( - clip_sampler, waveform.shape[1] / self.sample_rate) + all_clips_timepoints = self.get_clip_timepoints(clip_sampler, waveform.shape[1] / self.sample_rate) all_clips = [] for clip_timepoints in all_clips_timepoints: - waveform_clip = waveform[:, int(clip_timepoints[ - 0] * self.sample_rate):int(clip_timepoints[1] * - self.sample_rate), ] + waveform_clip = waveform[ + :, + int(clip_timepoints[0] * self.sample_rate) : int(clip_timepoints[1] * self.sample_rate), + ] waveform_melspec = self.waveform2melspec( - waveform_clip, self.sample_rate, self.num_mel_bins, - self.target_length) + waveform_clip, self.sample_rate, self.num_mel_bins, self.target_length + ) all_clips.append(waveform_melspec) normalize = T.Normalize( mean=self.mean - if not isinstance(self.mean, (float, int)) else [self.mean, ], + if not isinstance(self.mean, (float, int)) + else [ + self.mean, + ], std=self.std - if not isinstance(self.std, (float, int)) else [self.std, ], ) + if not isinstance(self.std, (float, int)) + else [ + self.std, + ], + ) all_clips = [normalize(ac) for ac in all_clips] all_clips = paddle.stack(x=all_clips, axis=0) @@ -165,13 +155,11 @@ def get_clip_timepoints(self, clip_sampler, duration): is_last_clip = False end = 0.0 while not is_last_clip: - start, end, _, _, is_last_clip = clip_sampler( - end, duration, annotation=None) + start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None) all_clips_timepoints.append((start, end)) return all_clips_timepoints - def waveform2melspec(self, waveform, sample_rate, num_mel_bins, - target_length): + def waveform2melspec(self, waveform, sample_rate, num_mel_bins, target_length): waveform -= waveform.mean() fbank = paddle.audio.fbank( waveform, @@ -182,7 +170,8 @@ def waveform2melspec(self, waveform, sample_rate, num_mel_bins, num_mel_bins=num_mel_bins, dither=0.0, frame_length=25, - frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS, ) + frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS, + ) x = fbank perm_0 = list(range(x.ndim)) perm_0[0] = 1 @@ -194,10 +183,10 @@ def waveform2melspec(self, waveform, sample_rate, num_mel_bins, logging.warning( "Large gap between audio n_frames(%d) and target_length (%d). 
Is the audio_target_length setting correct?", n_frames, - target_length, ) + target_length, + ) if p > 0: - fbank = paddle.pad_from_torch( - fbank, pad=(0, p), mode="constant", value=0) + fbank = paddle.pad_from_torch(fbank, pad=(0, p), mode="constant", value=0) elif p < 0: fbank = fbank[:, 0:target_length] diff --git a/paddlemix/processors/minigpt4_image_processing.py b/paddlemix/processors/minigpt4_image_processing.py index 0558e3124c4fb..1798f95574a4d 100644 --- a/paddlemix/processors/minigpt4_image_processing.py +++ b/paddlemix/processors/minigpt4_image_processing.py @@ -21,14 +21,26 @@ import PIL from paddlenlp.transformers.tokenizer_utils_base import TensorType -from .image_processing_utils import (BaseImageProcessor, BatchFeature, - get_size_dict) -from .image_transforms import (convert_to_rgb, normalize, rescale, resize, - to_channel_dimension_format) -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - is_batched, to_numpy_array, valid_images) - -__all__ = ["MiniGPT4ImageProcessor", ] +from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from .image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) + +__all__ = [ + "MiniGPT4ImageProcessor", +] class MiniGPT4ImageProcessor(BaseImageProcessor): @@ -69,17 +81,18 @@ class MiniGPT4ImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: super().__init__(**kwargs) default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] @@ -97,12 +110,13 @@ def __init__( self.do_convert_rgb = do_convert_rgb def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -127,14 +141,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. 
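The rescale step documented here is a plain multiplication, as the docstring says (image = image * scale). A small sketch using the processor's default 1/255 factor; the sample values are only for illustration:

import numpy as np

image_uint8 = np.array([[0, 128, 255]], dtype=np.uint8)
rescaled = image_uint8.astype(np.float32) * (1 / 255)   # image = image * scale
print(rescaled)                                          # [[0.        0.5019608 1.       ]]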
@@ -149,12 +165,13 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -168,24 +185,24 @@ def normalize( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - return normalize( - image, mean=mean, std=std, data_format=data_format, **kwargs) + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -227,13 +244,11 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -242,21 +257,16 @@ def preprocess( images = [images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") # PIL RGBA images are converted to RGB if do_convert_rgb: @@ -266,28 +276,15 @@ def preprocess( images = [to_numpy_array(image) for image in images] if do_resize: - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: - images = [ - self.normalize( - image=image, mean=image_mean, std=image_std) - for image in images - ] - - images = [ - to_channel_dimension_format(image, data_format) for image in images - ] + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlemix/processors/minigpt4_processing.py b/paddlemix/processors/minigpt4_processing.py index 72d71e7af1d34..3a79ceb64575f 100644 --- a/paddlemix/processors/minigpt4_processing.py +++ b/paddlemix/processors/minigpt4_processing.py @@ -20,15 +20,20 @@ import numpy as np import paddle -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) +from paddlenlp.transformers.tokenizer_utils_base import ( + BatchEncoding, + TensorType, + TextInput, +) from PIL import Image from .base_processing import ProcessorMixin from .image_processing_utils import BatchFeature from .image_utils import ImageInput -__all__ = ["MiniGPT4Processor", ] +__all__ = [ + "MiniGPT4Processor", +] class MiniGPT4Processor(ProcessorMixin): @@ -74,16 +79,16 @@ def __init__(self, image_processor, tokenizer): tokenizer.pad_token = tokenizer.eos_token super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - self.default_prompt = ( - "###Human: ###Assistant: ") + self.default_prompt = "###Human: ###Assistant: " self.image_tag = "" self.text_tag = "" def process_images( - self, - images: ImageInput, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchFeature: + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: """ This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model. Please refer to the docstring of the method for more information. 
@@ -95,34 +100,31 @@ def process_images( images = [images] # processing with image processor - processed_images = self.image_processor( - images, return_tensors=return_tensors) + processed_images = self.image_processor(images, return_tensors=return_tensors) return processed_images def process_texts( - self, - texts: Union[TextInput, List[TextInput]], - prompts: Union[TextInput, List[TextInput]]=None, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ): + self, + texts: Union[TextInput, List[TextInput]], + prompts: Union[TextInput, List[TextInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ): prompts = prompts if prompts is not None else [self.default_prompt] if (not isinstance(texts, TextInput)) and (not isinstance(texts, list)): + raise TypeError("Unsupported type for texts: {}, only str and list type supported.".format(type(texts))) + if prompts is not None and (not isinstance(prompts, TextInput)) and (not isinstance(prompts, list)): raise TypeError( - "Unsupported type for texts: {}, only str and list type supported.". - format(type(texts))) - if (prompts is not None and (not isinstance(prompts, TextInput)) and - (not isinstance(prompts, list))): - raise TypeError( - "Unsupported type for prompts: {}, only str and list type supported.". - format(type(prompts))) + "Unsupported type for prompts: {}, only str and list type supported.".format(type(prompts)) + ) if isinstance(prompts, list): if isinstance(texts, list) and len(prompts) != len(texts): raise ValueError( - "The length of prompts not is equal to texts' length: {} != {}". - format(len(prompts), len(texts))) + "The length of prompts not is equal to texts' length: {} != {}".format(len(prompts), len(texts)) + ) elif isinstance(texts, TextInput): texts = [texts] * len(prompts) else: @@ -137,47 +139,51 @@ def process_texts( if self.image_tag not in text: if self.image_tag not in prompt: raise ValueError( - "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.". - format(self.image_tag, self.image_tag)) + "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.".format( + self.image_tag, self.image_tag + ) + ) if self.text_tag not in prompt: raise ValueError( - "A prompt should contain a text tag `{}` to insert text information.". 
- format(self.text_tag)) + "A prompt should contain a text tag `{}` to insert text information.".format(self.text_tag) + ) assemble_texts.append(prompt.replace(self.text_tag, text)) else: assemble_texts.append(text) # processing with text tokenizer - first_texts, second_texts = zip(* [ - assemble_text.split(self.image_tag) - for assemble_text in assemble_texts - ]) + first_texts, second_texts = zip(*[assemble_text.split(self.image_tag) for assemble_text in assemble_texts]) first_text_encoding = self.tokenizer( text=first_texts, return_tensors=return_tensors, add_special_tokens=True, - **kwargs, ) + **kwargs, + ) second_text_encoding = self.tokenizer( text=second_texts, return_tensors=return_tensors, add_special_tokens=False, - **kwargs, ) - - encoded_texts = BatchEncoding({ - "first_input_ids": first_text_encoding["input_ids"], - "first_attention_mask": first_text_encoding["attention_mask"], - "second_input_ids": second_text_encoding["input_ids"], - "second_attention_mask": second_text_encoding["attention_mask"], - }) + **kwargs, + ) + + encoded_texts = BatchEncoding( + { + "first_input_ids": first_text_encoding["input_ids"], + "first_attention_mask": first_text_encoding["attention_mask"], + "second_input_ids": second_text_encoding["input_ids"], + "second_attention_mask": second_text_encoding["attention_mask"], + } + ) return encoded_texts def __call__( - self, - images: ImageInput=None, - text: str=None, - prompt: str=None, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchFeature: + self, + images: ImageInput = None, + text: str = None, + prompt: str = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: """ This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model, and [`LlamaTokenizer.__call__`] to prepare text for the model. @@ -186,21 +192,17 @@ def __call__( prompt = prompt if prompt is not None else self.default_prompt if images is None and text is None: - raise ValueError( - "Images and text are None, you have to specify either images or texts." - ) - if images is not None and not isinstance( - images, (Image.Image, np.ndarray, paddle.Tensor, list)): + raise ValueError("Images and text are None, you have to specify either images or texts.") + if images is not None and not isinstance(images, (Image.Image, np.ndarray, paddle.Tensor, list)): raise TypeError( - "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.". - format(type(images))) + "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.".format( + type(images) + ) + ) if text is not None and not isinstance(text, str): - raise TypeError("A str type of text is expected, but received {}.". 
- format(type(text))) + raise TypeError("A str type of text is expected, but received {}.".format(type(text))) if prompt is not None and not isinstance(prompt, str): - raise TypeError( - "A str type of prompt is expected, but received {}.".format( - type(prompt))) + raise TypeError("A str type of prompt is expected, but received {}.".format(type(prompt))) if images is not None and not isinstance(images, list): images = [images] @@ -214,8 +216,7 @@ def __call__( # image-only mode if text is None: # processing with image processor - processed_features = self.process_images( - images, return_tensors=return_tensors, **kwargs) + processed_features = self.process_images(images, return_tensors=return_tensors, **kwargs) return processed_features # text-only mode @@ -225,8 +226,7 @@ def __call__( return encoded_texts # text-image mode - processed_features = self.image_processor( - images, return_tensors=return_tensors) + processed_features = self.image_processor(images, return_tensors=return_tensors) encoded_texts = self.process_texts(texts, prompts, **kwargs) processed_features.update(encoded_texts) @@ -251,5 +251,4 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/paddlemix/processors/processing_utils.py b/paddlemix/processors/processing_utils.py index dd755ce4c9447..4bce239cd1133 100644 --- a/paddlemix/processors/processing_utils.py +++ b/paddlemix/processors/processing_utils.py @@ -19,15 +19,23 @@ from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_download, - hf_hub_url, repo_type_and_id_from_hf_id, - upload_folder) +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_download, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) from huggingface_hub.utils import EntryNotFoundError from paddlenlp import __version__ from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding from paddlemix.utils.downloader import ( - COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock, resolve_cache_dir) + COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir, +) from paddlemix.utils.log import logger PROCESSOR_CONFIG_MAPPING = { @@ -63,9 +71,7 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" Instantiate a type of [`~processing_utils.BaseProcessingMixin`] from an processor. @@ -121,13 +127,11 @@ def from_pretrained(cls, Returns: A processor of type [`~processing_utils.BaseProcessingMixin`]. 
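For context, the community-model branch above simply joins a URL prefix, the model name, and the processor config filename before downloading the file with `get_path_from_url_with_filelock`. A hedged sketch with placeholder values; the real prefix and filename come from `COMMUNITY_MODEL_PREFIX` and `PROCESSOR_CONFIG_MAPPING[cls.input_type]`:

COMMUNITY_MODEL_PREFIX = "https://example.org/models/community"   # placeholder, not the real prefix
pretrained_model_name_or_path = "some-org/some-processor"          # hypothetical model id
config_name = "image_preprocessor_config.json"                     # hypothetical config filename

processor_file = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, config_name])
print(processor_file)
# https://example.org/models/community/some-org/some-processor/image_preprocessor_config.json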
```""" - processor_dict, kwargs = cls.get_processor_dict( - pretrained_model_name_or_path, **kwargs) + processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(processor_dict, **kwargs) - def save_pretrained(self, save_directory: Union[str, os.PathLike], - **kwargs): + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): """ Save an processor object to the directory `save_directory`, so that it can be re-loaded using the [`~processing_utils.BaseProcessingMixin.from_pretrained`] class method. @@ -139,15 +143,12 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. """ if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` - output_processor_file = os.path.join( - save_directory, PROCESSOR_CONFIG_MAPPING[self.input_type]) + output_processor_file = os.path.join(save_directory, PROCESSOR_CONFIG_MAPPING[self.input_type]) self.to_json_file(output_processor_file) logger.info(f"processor saved in {output_processor_file}") @@ -155,13 +156,14 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], return [output_processor_file] def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool]=None, - subfolder: Optional[str]=None, - commit_message: Optional[str]=None, - revision: Optional[str]=None, - create_pr: bool=False, ): + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): """ Uploads all elements of this processor to a new HuggingFace Hub repository. Args: @@ -186,9 +188,7 @@ def save_to_hf_hub( # Check if README file already exist in repo try: - get_hf_file_metadata( - hf_hub_url( - repo_id=repo_id, filename="README.md", revision=revision)) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) has_readme = True except EntryNotFoundError: has_readme = False @@ -214,12 +214,13 @@ def save_to_hf_hub( folder_path=root_dir, commit_message=commit_message, revision=revision, - create_pr=create_pr, ) + create_pr=create_pr, + ) @classmethod def get_processor_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a processor of type [`~processor_utils.BaseProcessingMixin`] using `from_dict`. 
@@ -237,15 +238,14 @@ def get_processor_dict( cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) subfolder = kwargs.pop("subfolder", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, - from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): resolved_processor_file = os.path.join( - pretrained_model_name_or_path, - PROCESSOR_CONFIG_MAPPING[cls.input_type]) + pretrained_model_name_or_path, PROCESSOR_CONFIG_MAPPING[cls.input_type] + ) elif os.path.isfile(pretrained_model_name_or_path): resolved_processor_file = pretrained_model_name_or_path is_local = True @@ -257,18 +257,20 @@ def get_processor_dict( cache_dir=cache_dir, subfolder=subfolder, library_name="PaddleNLP", - library_version=__version__, ) + library_version=__version__, + ) else: # Assuming from community-contributed pretrained models - processor_file = "/".join([ - COMMUNITY_MODEL_PREFIX, - pretrained_model_name_or_path, - PROCESSOR_CONFIG_MAPPING[cls.input_type], - ]) + processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + PROCESSOR_CONFIG_MAPPING[cls.input_type], + ] + ) try: # Load from local folder or from cache or download from model Hub and cache - resolved_processor_file = get_path_from_url_with_filelock( - processor_file, cache_dir) + resolved_processor_file = get_path_from_url_with_filelock(processor_file, cache_dir) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to # the original exception. 
@@ -296,9 +298,7 @@ def get_processor_dict( if is_local: logger.info(f"loading configuration file {resolved_processor_file}") else: - logger.info( - f"loading configuration file {processor_file} from cache at {resolved_processor_file}" - ) + logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}") return processor_dict, kwargs @@ -416,8 +416,7 @@ def __call__(self, images, **kwargs) -> BatchEncoding: return self.preprocess(images, **kwargs) def preprocess(self, images, **kwargs) -> BatchEncoding: - raise NotImplementedError( - "Each image processor must implement its own preprocess method") + raise NotImplementedError("Each image processor must implement its own preprocess method") class BaseTextProcessor(BaseProcessingMixin): @@ -431,8 +430,7 @@ def __call__(self, text, **kwargs) -> BatchEncoding: return self.preprocess(text, **kwargs) def preprocess(self, text, **kwargs) -> BatchEncoding: - raise NotImplementedError( - "Each image processor must implement its own preprocess method") + raise NotImplementedError("Each image processor must implement its own preprocess method") class BaseAudioProcessor(BaseProcessingMixin): @@ -446,14 +444,14 @@ def __call__(self, audios, **kwargs) -> BatchEncoding: return self.preprocess(audios, **kwargs) def preprocess(self, audios, **kwargs) -> BatchEncoding: - raise NotImplementedError( - "Each audios processor must implement its own preprocess method") + raise NotImplementedError("Each audios processor must implement its own preprocess method") VALID_SIZE_DICT_KEYS = ( {"height", "width"}, {"shortest_edge"}, - {"shortest_edge", "longest_edge"}, ) + {"shortest_edge", "longest_edge"}, +) def is_valid_size_dict(size_dict): @@ -468,16 +466,15 @@ def is_valid_size_dict(size_dict): def convert_to_size_dict( - size, - max_size: Optional[int]=None, - default_to_square: bool=True, - height_width_order: bool=True, ): + size, + max_size: Optional[int] = None, + default_to_square: bool = True, + height_width_order: bool = True, +): # By default, if size is an int we assume it represents a tuple of (size, size). if isinstance(size, int) and default_to_square: if max_size is not None: - raise ValueError( - "Cannot specify both size as an int, with default_to_square=True and max_size" - ) + raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size") return {"height": size, "width": size} # In other configs, if size is an int and default_to_square is False, size represents the length of # the shortest edge after resizing. @@ -496,11 +493,12 @@ def convert_to_size_dict( def get_size_dict( - size: Union[int, Iterable[int], Dict[str, int]]=None, - max_size: Optional[int]=None, - height_width_order: bool=True, - default_to_square: bool=True, - param_name="size", ) -> dict: + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: """ Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, @@ -523,11 +521,11 @@ def get_size_dict( If `size` is an int, whether to default to a square image or not. 
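The SAM prompt and image processors above both rely on `get_preprocess_shape` to map the original resolution onto the model's input size before coordinates and boxes are rescaled. A minimal sketch of that rescaling, assuming the usual SAM rule of scaling the longer side to `size` (here 1024) and rounding to the nearest integer; names below are illustrative:

import numpy as np

def preprocess_shape(old_h, old_w, long_side=1024):
    # assumed resize rule: scale the longer side to long_side, round to nearest int
    scale = long_side / max(old_h, old_w)
    return int(old_h * scale + 0.5), int(old_w * scale + 0.5)

old_h, old_w = 480, 640
new_h, new_w = preprocess_shape(old_h, old_w)   # (768, 1024)

coords = np.array([[320.0, 240.0]])             # one (x, y) prompt point
coords[..., 0] *= new_w / old_w                 # per-axis scaling, as in apply_coords
coords[..., 1] *= new_h / old_h
print((new_h, new_w), coords)                   # (768, 1024) [[512. 384.]]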
""" if not isinstance(size, dict): - size_dict = convert_to_size_dict(size, max_size, default_to_square, - height_width_order) + size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order) logger.info( f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." - f" Converted to {size_dict}.", ) + f" Converted to {size_dict}.", + ) else: size_dict = size diff --git a/paddlemix/processors/sam_processing.py b/paddlemix/processors/sam_processing.py index 50ace480b7cfd..349d6d9e944a6 100644 --- a/paddlemix/processors/sam_processing.py +++ b/paddlemix/processors/sam_processing.py @@ -15,20 +15,22 @@ Processor class for Sam. """ -import re from copy import deepcopy from typing import Dict, List, Optional, Tuple, Union import numpy as np import paddle -import PIL from paddle.nn import functional as F from paddle.vision.transforms.functional import resize from .base_processing import ProcessorMixin from .image_transform_utils import to_pil_image -from .image_utils import (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, - get_preprocess_shape, valid_images) +from .image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + get_preprocess_shape, + valid_images, +) from .processing_utils import BaseImageProcessor, BaseTextProcessor __all__ = [ @@ -52,17 +54,17 @@ def __init__(self, image_processor, prompt_processor): self.encode_size = self.image_processor.size def __call__( - self, - images, - input_type, - point_coords=None, - point_labels=None, - box=None, - **kwargs, ): + self, + images, + input_type, + point_coords=None, + point_labels=None, + box=None, + **kwargs, + ): if images is None or input_type is None: - raise ValueError( - "You have to specify either images and input_type.") + raise ValueError("You have to specify either images and input_type.") if input_type == "boxs" and box is None: raise ValueError("You have to specify either box.") @@ -78,20 +80,21 @@ def __call__( self.original_size, point_coords=point_coords, point_labels=point_labels, - box=box, ) + box=box, + ) return image_seg, prompt - def postprocess_masks(self, low_res_masks, mask_threshold: float=0.0): + def postprocess_masks(self, low_res_masks, mask_threshold: float = 0.0): masks = F.interpolate( paddle.to_tensor(low_res_masks), (self.encode_size, self.encode_size), mode="bilinear", - align_corners=False, ) - masks = masks[..., :self.input_size[0], :self.input_size[1]] - masks = F.interpolate( - masks, self.original_size, mode="bilinear", align_corners=False) + align_corners=False, + ) + masks = masks[..., : self.input_size[0], : self.input_size[1]] + masks = F.interpolate(masks, self.original_size, mode="bilinear", align_corners=False) masks = masks > mask_threshold return masks @@ -108,28 +111,26 @@ class SamPromptProcessor(BaseTextProcessor): """ def __init__( - self, - size: int=1024, - **kwargs, ): + self, + size: int = 1024, + **kwargs, + ): super().__init__(**kwargs) self.size = size - def apply_coords(self, coords: np.ndarray, - original_size: Tuple[int, ...]) -> np.ndarray: + def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: """ Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format. 
""" old_h, old_w = original_size - new_h, new_w = get_preprocess_shape(original_size[0], original_size[1], - self.size) + new_h, new_w = get_preprocess_shape(original_size[0], original_size[1], self.size) coords = deepcopy(coords).astype(float) coords[..., 0] = coords[..., 0] * (new_w / old_w) coords[..., 1] = coords[..., 1] * (new_h / old_h) return coords - def apply_boxes(self, boxes: np.ndarray, - original_size: Tuple[int, ...]) -> np.ndarray: + def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: """ Expects a numpy array shape Bx4. Requires the original image size in (H, W) format. @@ -138,17 +139,23 @@ def apply_boxes(self, boxes: np.ndarray, return boxes.reshape([-1, 4]) def __call__( - self, - original_size, - point_coords=None, - point_labels=None, - box=None, - **kwargs, ): - coords_paddle, labels_paddle, box_paddle, mask_input_paddle = ( - None, + self, + original_size, + point_coords=None, + point_labels=None, + box=None, + **kwargs, + ): + # coords_paddle, labels_paddle, box_paddle, mask_input_paddle = ( + # None, + # None, + # None, + # None, + # ) + coords_paddle, box_paddle = ( None, None, - None, ) + ) if point_coords is not None: point_coords = self.apply_coords(point_coords, original_size) coords_paddle = paddle.to_tensor(point_coords).cast("float32") @@ -171,22 +178,22 @@ class SamImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - size: List[int]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - image_format: str="RGB", - original_size: List[int]=None, - input_size: List[int]=None, - **kwargs, ) -> None: + self, + size: List[int] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + image_format: str = "RGB", + original_size: List[int] = None, + input_size: List[int] = None, + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else 1024 self.size = size self.image_format = image_format - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.original_size = original_size @@ -196,19 +203,19 @@ def apply_image(self, image: np.ndarray) -> np.ndarray: """ Expects a numpy array with shape HxWxC in uint8 format. """ - target_size = get_preprocess_shape(image.shape[0], image.shape[1], - self.size) + target_size = get_preprocess_shape(image.shape[0], image.shape[1], self.size) return np.array(resize(to_pil_image(image), target_size)) def preprocess( - self, - images, - size: Optional[Dict[str, int]]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - image_format: str="RGB", - **kwargs, ): + self, + images, + size: Optional[Dict[str, int]] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + image_format: str = "RGB", + **kwargs, + ): """ Preprocess an image or batch of images. 
@@ -221,13 +228,11 @@ def preprocess( if not isinstance(images, (list, tuple)): images = [images] - if isinstance(images[0], str): - images = [load_image(image) for image in images] + # if isinstance(images[0], str): + # images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") assert image_format in [ "RGB", @@ -248,9 +253,8 @@ def preprocess( self.input_size = tuple(input_image_paddle.shape[-2:]) input_image_paddle = ( - input_image_paddle - paddle.to_tensor(self.image_mean).reshape( - [-1, 1, 1])) / paddle.to_tensor(self.image_std).reshape( - [-1, 1, 1]) + input_image_paddle - paddle.to_tensor(self.image_mean).reshape([-1, 1, 1]) + ) / paddle.to_tensor(self.image_std).reshape([-1, 1, 1]) # Pad h, w = input_image_paddle.shape[-2:] diff --git a/paddlemix/processors/tokenizer.py b/paddlemix/processors/tokenizer.py index 31c9b01b37444..fab9519a091b3 100644 --- a/paddlemix/processors/tokenizer.py +++ b/paddlemix/processors/tokenizer.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle + """ CLIP tokenizer Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. @@ -31,9 +32,7 @@ @lru_cache() def default_bpe(): - return os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "bpe_simple_vocab_16e6.txt.gz") + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") @lru_cache() @@ -47,9 +46,9 @@ def bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. 
""" - bs = (list(range(ord("!"), ord("~") + 1)) + - list(range(ord("¡"), ord("¬") + 1)) + - list(range(ord("®"), ord("ÿ") + 1))) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 for b in range(2**8): @@ -87,11 +86,11 @@ def whitespace_clean(text): class SimpleTokenizer(object): - def __init__(self, bpe_path: str=default_bpe(), special_tokens=None): + def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") - merges = merges[1:49152 - 256 - 2 + 1] + merges = merges[1 : 49152 - 256 - 2 + 1] """Class Method: *.split, not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*, and convert manually""" merges = [tuple(merge.split()) for merge in merges] vocab = list(bytes_to_unicode().values()) @@ -101,8 +100,7 @@ def __init__(self, bpe_path: str=default_bpe(), special_tokens=None): if not special_tokens: special_tokens = ["", ""] else: - special_tokens = ["", "" - ] + special_tokens + special_tokens = ["", ""] + special_tokens vocab.extend(special_tokens) self.encoder = dict(zip(vocab, range(len(vocab)))) self.decoder = {v: k for k, v in self.encoder.items()} @@ -110,22 +108,21 @@ def __init__(self, bpe_path: str=default_bpe(), special_tokens=None): self.cache = {t: t for t in special_tokens} special = "|".join(special_tokens) self.pat = re.compile( - special + - "|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", - re.IGNORECASE, ) + special + "|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", + re.IGNORECASE, + ) self.vocab_size = len(self.encoder) self.all_special_ids = [self.encoder[t] for t in special_tokens] def bpe(self, token): if token in self.cache: return self.cache[token] - word = tuple(token[:-1]) + (token[-1] + "", ) + word = tuple(token[:-1]) + (token[-1] + "",) pairs = get_pairs(word) if not pairs: return token + "" while True: - bigram = min( - pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -139,8 +136,7 @@ def bpe(self, token): except: new_word.extend(word[i:]) break - if word[i] == first and i < len(word) - 1 and word[i + - 1] == second: + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: @@ -162,30 +158,26 @@ def encode(self, text1): for token in re.findall(self.pat, text): token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) """Class Method: *.split, not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*, and convert manually""" - bpe_tokens.extend(self.encoder[bpe_token] - for bpe_token in self.bpe(token).split(" ")) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def decode(self, tokens): text = "".join([self.decoder[token] for token in tokens]) - text = (bytearray([self.byte_decoder[c] for c in text]).decode( - "utf-8", errors="replace").replace("", " ")) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("", " ") return text def __call__(self, text, max_length=77, return_tensors=True, **kwargs): texts = text sot_token = self.encoder[""] eot_token = self.encoder[""] - all_tokens = 
[([sot_token] + _tokenizer.encode(text) + [eot_token]) - for text in texts] + all_tokens = [([sot_token] + _tokenizer.encode(text) + [eot_token]) for text in texts] if return_tensors: - result = paddle.zeros( - shape=[len(all_tokens), max_length], dtype="int64") + result = paddle.zeros(shape=[len(all_tokens), max_length], dtype="int64") for i, tokens in enumerate(all_tokens): if len(tokens) > max_length: tokens = tokens[:max_length] tokens[-1] = eot_token - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) + result[(i), : len(tokens)] = paddle.to_tensor(data=tokens) return {"input_ids": result} else: result = [] @@ -204,8 +196,7 @@ def from_pretrained(cls, *args, **kwargs): _tokenizer = SimpleTokenizer() -def tokenize(texts: Union[str, List[str]], - context_length: int=77) -> paddle.Tensor: +def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> paddle.Tensor: """ Returns the tokenized representation of given input string(s) @@ -224,15 +215,13 @@ def tokenize(texts: Union[str, List[str]], texts = [texts] sot_token = _tokenizer.encoder[""] eot_token = _tokenizer.encoder[""] - all_tokens = [([sot_token] + _tokenizer.encode(text) + [eot_token]) - for text in texts] - result = paddle.zeros( - shape=[len(all_tokens), context_length], dtype="int64") + all_tokens = [([sot_token] + _tokenizer.encode(text) + [eot_token]) for text in texts] + result = paddle.zeros(shape=[len(all_tokens), context_length], dtype="int64") for i, tokens in enumerate(all_tokens): if len(tokens) > context_length: tokens = tokens[:context_length] tokens[-1] = eot_token - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) + result[(i), : len(tokens)] = paddle.to_tensor(data=tokens) return result @@ -244,8 +233,7 @@ def __init__(self, tokenizer_name: str): self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - def __call__(self, texts: Union[str, List[str]], - context_length: int=77) -> paddle.Tensor: + def __call__(self, texts: Union[str, List[str]], context_length: int = 77) -> paddle.Tensor: if isinstance(texts, str): texts = [texts] texts = [whitespace_clean(basic_clean(text)) for text in texts] @@ -254,5 +242,6 @@ def __call__(self, texts: Union[str, List[str]], return_tensors="pt", max_length=context_length, padding="max_length", - truncation=True, ).input_ids + truncation=True, + ).input_ids return input_ids diff --git a/paddlemix/processors/utils.py b/paddlemix/processors/utils.py index 29ae623096eff..53fe7051435f1 100644 --- a/paddlemix/processors/utils.py +++ b/paddlemix/processors/utils.py @@ -28,7 +28,6 @@ def _missing_(cls, value): def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): diff --git a/paddlemix/processors/visualglm_image_processing.py b/paddlemix/processors/visualglm_image_processing.py index 2ed3464393e07..7e0afc9ec6ef6 100644 --- a/paddlemix/processors/visualglm_image_processing.py +++ b/paddlemix/processors/visualglm_image_processing.py @@ -21,14 +21,26 @@ import PIL from paddlenlp.transformers.tokenizer_utils_base import TensorType -from .image_processing_utils import (BaseImageProcessor, BatchFeature, - get_size_dict) -from .image_transforms import (convert_to_rgb, normalize, rescale, resize, - to_channel_dimension_format) -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - is_batched, to_numpy_array, valid_images) - -__all__ = ["VisualGLMImageProcessor", ] +from .image_processing_utils import BaseImageProcessor, 
BatchFeature, get_size_dict +from .image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) + +__all__ = [ + "VisualGLMImageProcessor", +] class VisualGLMImageProcessor(BaseImageProcessor): @@ -69,17 +81,18 @@ class VisualGLMImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: super().__init__(**kwargs) default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] @@ -97,12 +110,13 @@ def __init__( self.do_convert_rgb = do_convert_rgb def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -127,14 +141,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. @@ -149,12 +165,13 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -168,24 +185,24 @@ def normalize( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. 
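To close, the MiniGPT4 and VisualGLM image processors touched in this patch apply the same rescale, normalize, and channels-first pipeline with the CLIP defaults shown above. A minimal NumPy sketch of that pipeline; the input shape and sample value are illustrative only:

import numpy as np

image = np.full((224, 224, 3), 128, dtype=np.uint8)   # HWC uint8 input
mean = np.array([0.48145466, 0.4578275, 0.40821073])  # default_image_mean
std = np.array([0.26862954, 0.26130258, 0.27577711])  # default_image_std

pixels = image.astype(np.float32) * (1 / 255)         # do_rescale
pixels = (pixels - mean) / std                        # do_normalize
pixels = pixels.transpose(2, 0, 1)                    # ChannelDimension.FIRST
print(pixels.shape)                                   # (3, 224, 224)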
""" - return normalize( - image, mean=mean, std=std, data_format=data_format, **kwargs) + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -227,13 +244,11 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -242,21 +257,16 @@ def preprocess( images = [images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") # PIL RGBA images are converted to RGB if do_convert_rgb: @@ -266,28 +276,15 @@ def preprocess( images = [to_numpy_array(image) for image in images] if do_resize: - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: - images = [ - self.normalize( - image=image, mean=image_mean, std=image_std) - for image in images - ] - - images = [ - to_channel_dimension_format(image, data_format) for image in images - ] + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlemix/processors/visualglm_processing.py b/paddlemix/processors/visualglm_processing.py index 5295186a7eab0..e26d1302e45ae 100644 --- a/paddlemix/processors/visualglm_processing.py +++ b/paddlemix/processors/visualglm_processing.py @@ -21,15 +21,20 @@ import numpy as np import paddle -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) +from paddlenlp.transformers.tokenizer_utils_base import ( + BatchEncoding, + TensorType, + TextInput, +) from PIL import Image from .base_processing import ProcessorMixin from .image_processing_utils import BatchFeature from .image_utils import ImageInput -__all__ = ["VisualGLMProcessor", ] +__all__ = [ + "VisualGLMProcessor", +] class VisualGLMProcessor(ProcessorMixin): @@ -78,10 +83,11 @@ def __init__(self, image_processor, tokenizer): self.num_query_tokens = 32 def process_images( - self, - images: ImageInput, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchFeature: + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: """ This method uses [`VisualGLMImageProcessor.__call__`] method to prepare image(s) for the model. Please refer to the docstring of the method for more information. 
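Note on the preprocess() refactor above: it only reflows the code, the order of operations is unchanged: optional RGB conversion, conversion to a numpy array, resize, rescale, normalize, then channel-first layout for "pixel_values". A minimal standalone sketch of that order, assuming a 224x224 target size and the default mean/std from __init__ (hypothetical helper, not the class itself):

import numpy as np
from PIL import Image

def preprocess_one(image: Image.Image, size=(224, 224), scale=1 / 255):
    # Same order as VisualGLMImageProcessor.preprocess: convert_to_rgb ->
    # to_numpy_array -> resize -> rescale -> normalize -> channel-first.
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype="float32")
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype="float32")
    image = image.convert("RGB").resize(size, Image.BICUBIC)
    array = np.asarray(image).astype("float32") * scale
    array = (array - mean) / std
    return array.transpose(2, 0, 1)  # HWC -> CHW, the "pixel_values" layout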
@@ -92,31 +98,31 @@ def process_images( if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): images = [images] - processed_images = self.image_processor( - images, return_tensors=return_tensors) + processed_images = self.image_processor(images, return_tensors=return_tensors) return processed_images def process_texts( - self, - texts: Union[TextInput, List[TextInput]], - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchEncoding: + self, + texts: Union[TextInput, List[TextInput]], + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchEncoding: if not texts: raise ValueError("You have to input correct texts.") if isinstance(texts, TextInput): texts = [texts] - processed_texts = self.tokenizer( - text=texts, return_tensors=return_tensors, **kwargs) + processed_texts = self.tokenizer(text=texts, return_tensors=return_tensors, **kwargs) return BatchEncoding(processed_texts) def build_inputs_with_image( - self, - image: Union[Image.Image, np.ndarray, paddle.Tensor], - query: str, - history: Optional[str]=None, ): + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = None, + ): # construct prompt with inputs if image is not None: prompt = self.default_prompt @@ -129,22 +135,17 @@ def build_inputs_with_image( if image is not None: image_start_position = prompt.rfind(self.image_tag) image_end_position = image_start_position + len(self.image_tag) - first_text_input = self.tokenizer.encode( - prompt[:image_start_position], add_special_tokens=False) + first_text_input = self.tokenizer.encode(prompt[:image_start_position], add_special_tokens=False) image_input = [self.tokenizer.unk_token_id] * self.num_query_tokens - second_text_input = self.tokenizer.encode( - prompt[image_end_position:], add_special_tokens=False) - all_input_ids = (first_text_input["input_ids"] + image_input + - second_text_input["input_ids"]) - all_input_ids = self.tokenizer.build_inputs_with_special_tokens( - all_input_ids) + second_text_input = self.tokenizer.encode(prompt[image_end_position:], add_special_tokens=False) + all_input_ids = first_text_input["input_ids"] + image_input + second_text_input["input_ids"] + all_input_ids = self.tokenizer.build_inputs_with_special_tokens(all_input_ids) # processing image processed_image = self.process_images(image) inputs = { - "input_ids": paddle.to_tensor( - all_input_ids, dtype="int64").unsqueeze(0), + "input_ids": paddle.to_tensor(all_input_ids, dtype="int64").unsqueeze(0), "pre_image_length": len(first_text_input["input_ids"]), "pixel_values": processed_image["pixel_values"], } @@ -155,23 +156,24 @@ def build_inputs_with_image( return inputs def __call__( - self, - image: Union[Image.Image, np.ndarray, paddle.Tensor], - query: str, - history: Optional[str]=[], - **kwargs, ): + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = [], + **kwargs, + ): if image is None: raise ValueError("Image should not be None.") if query is None: raise ValueError("Query should not be None.") if not isinstance(query, str): - raise TypeError( - "A string type of query is expected, but acceived {}.".format( - type(query))) + raise TypeError("A string type of query is expected, but acceived {}.".format(type(query))) if not isinstance(history, list): raise TypeError( - "A list type of history is expected with each item [query, response] in it, but acceived {}.". 
- format(type(history))) + "A list type of history is expected with each item [query, response] in it, but acceived {}.".format( + type(history) + ) + ) inputs = self.build_inputs_with_image(image, query, history=history) @@ -203,10 +205,8 @@ def process_response(self, response): ["\?", "?"], ] for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], - r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], - r"%s\1" % item[1], response) + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) return response def get_responses(self, *args, **kwargs): @@ -223,5 +223,4 @@ def get_responses(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/paddlemix/trainer/blip2_trainer.py b/paddlemix/trainer/blip2_trainer.py index 708236f547e58..b3b9a6a831e4e 100644 --- a/paddlemix/trainer/blip2_trainer.py +++ b/paddlemix/trainer/blip2_trainer.py @@ -11,39 +11,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import paddlemix -from paddlenlp.trainer.trainer import Trainer -from paddlemix.optimization import FilterParamsName -from paddlemix.examples.blip2.utils import coco_caption_eval - import contextlib import inspect -import math +import json import os import sys import time -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np import paddle import paddle.amp.auto_cast as autocast import paddle.nn as nn from paddle.distributed import fleet -from paddle.io import DataLoader, Dataset, DistributedBatchSampler - +from paddle.io import DataLoader, Dataset +from paddlenlp.trainer.trainer import Trainer +from paddlenlp.trainer.trainer_callback import DefaultFlowCallback, ProgressCallback +from paddlenlp.trainer.trainer_utils import ( # set_hyrbid_parallel_seed, + EvalLoopOutput, + IterableDatasetShard, + ShardingOption, + has_length, + speed_metrics, +) from paddlenlp.transformers.model_utils import unwrap_model from paddlenlp.utils import device_guard -from paddlenlp.utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler from paddlenlp.utils.import_utils import is_datasets_available from paddlenlp.utils.log import logger -from paddlenlp.trainer.trainer_callback import ( - DefaultFlowCallback, - ProgressCallback, ) -from paddlenlp.trainer.trainer_utils import ( # set_hyrbid_parallel_seed, - EvalLoopOutput, EvalPrediction, IterableDatasetShard, ShardingOption, - find_batch_size, has_length, speed_metrics, ) -import json -from paddlemix.examples.blip2.utils import save_result, VQA, VQAEval + +import paddlemix +from paddlemix.examples.blip2.utils import VQA, VQAEval, coco_caption_eval, save_result +from paddlemix.optimization import FilterParamsName DEFAULT_CALLBACKS = [DefaultFlowCallback] DEFAULT_PROGRESS_CALLBACK = ProgressCallback @@ -74,9 +71,7 @@ def paddlenlp_load(path, return_numpy=False): def is_dp_group_support_in_group_sharded_parallel(): - return "dp_group" in set( - 
inspect.signature(paddle.distributed.sharding.group_sharded_parallel) - .parameters.keys()) + return "dp_group" in set(inspect.signature(paddle.distributed.sharding.group_sharded_parallel).parameters.keys()) __all__ = ["BLIP2Trainer"] @@ -93,21 +88,21 @@ class BLIP2Trainer(Trainer): """ - from paddlenlp.trainer.trainer_utils import log_metrics, metrics_format, save_metrics, save_state + from paddlenlp.trainer.trainer_utils import ( + log_metrics, + metrics_format, + save_metrics, + save_state, + ) - def __init__(self, - processor=None, - eval_processor=None, - eval_collator=None, - **kwargs): + def __init__(self, processor=None, eval_processor=None, eval_collator=None, **kwargs): super().__init__(**kwargs) self.processor = processor self.eval_processor = eval_processor self.eval_collator = eval_collator def create_optimizer_and_scheduler(self, num_training_steps: int): - self.lr_scheduler = self.create_scheduler(num_training_steps // - self.args.num_train_epochs) + self.lr_scheduler = self.create_scheduler(num_training_steps // self.args.num_train_epochs) param_filter = FilterParamsName() p_wd, p_non_wd = param_filter(self.model) self.optimizer = paddle.optimizer.AdamW( @@ -116,22 +111,22 @@ def create_optimizer_and_scheduler(self, num_training_steps: int): weight_decay=float(self.args.weight_decay), beta1=self.args.adam_beta1, beta2=self.args.adam_beta2, - apply_decay_param_fun=param_filter._apply_decay_param_fun, ) + apply_decay_param_fun=param_filter._apply_decay_param_fun, + ) def create_scheduler(self, num_training_steps): - lr_sched_func = getattr(paddlemix.optimization, - self.args.lr_scheduler_name) + lr_sched_func = getattr(paddlemix.optimization, self.args.lr_scheduler_name) lr_sched = lr_sched_func( learning_rate=self.args.learning_rate, epochs=self.args.num_train_epochs, warmup_start_lr=self.args.warmup_start_lr, eta_min=self.args.eta_min, warmup_steps=self.args.warmup_steps, - step_each_epoch=num_training_steps, ) + step_each_epoch=num_training_steps, + ) return lr_sched - def get_eval_dataloader(self, - eval_dataset: Optional[Dataset]=None) -> DataLoader: + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: """ Returns the evaluation [`~paddle.io.DataLoader`]. 
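Note on create_optimizer_and_scheduler above: FilterParamsName is wired into AdamW through apply_decay_param_fun, which Paddle calls with each parameter name to decide whether weight decay is applied. A rough sketch of that pattern with an assumed name-based rule (the real criteria live in paddlemix.optimization.FilterParamsName):

import paddle

def apply_decay(param_name: str) -> bool:
    # Hypothetical rule: skip weight decay for biases and norm parameters.
    return not any(key in param_name for key in ("bias", "norm"))

model = paddle.nn.Linear(8, 8)  # stand-in model for the sketch
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.05,
    apply_decay_param_fun=apply_decay,
)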
@@ -146,10 +141,8 @@ def get_eval_dataloader(self, raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - if is_datasets_available() and isinstance(eval_dataset, - datasets.Dataset): - eval_dataset = self._remove_unused_columns( - eval_dataset, description="evaluation") + if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): + eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") if self._is_iterable_dataset(eval_dataset): if self.args.dataset_world_size > 1: @@ -158,13 +151,15 @@ def get_eval_dataloader(self, batch_size=self.args.per_device_eval_batch_size, drop_last=self.args.dataloader_drop_last, num_processes=self.args.dataset_world_size, - process_index=self.args.dataset_rank, ) + process_index=self.args.dataset_rank, + ) return DataLoader( eval_dataset, batch_size=self.args.per_device_eval_batch_size, collate_fn=self.eval_collator, - num_workers=self.args.dataloader_num_workers, ) + num_workers=self.args.dataloader_num_workers, + ) eval_sampler = self._get_eval_sampler(eval_dataset) @@ -172,7 +167,8 @@ def get_eval_dataloader(self, eval_dataset, batch_sampler=eval_sampler, collate_fn=self.eval_collator, - num_workers=self.args.dataloader_num_workers, ) + num_workers=self.args.dataloader_num_workers, + ) def _wrap_model(self, model, training=True): @@ -190,15 +186,11 @@ def _wrap_model(self, model, training=True): # model, self.optimizer if hasattr(model, "language_model"): decorated = paddle.amp.decorate( - models=[model.visual_encoder, model.language_model], - optimizers=self.optimizer, - level="O2") + models=[model.visual_encoder, model.language_model], optimizers=self.optimizer, level="O2" + ) model.visual_encoder, model.language_model = decorated[0] else: - decorated = paddle.amp.decorate( - models=[model.visual_encoder], - optimizers=self.optimizer, - level="O2") + decorated = paddle.amp.decorate(models=[model.visual_encoder], optimizers=self.optimizer, level="O2") model.visual_encoder = decorated[0][0] self.optimizer.set_state_dict(decorated[1].state_dict()) @@ -206,34 +198,32 @@ def _wrap_model(self, model, training=True): if self.args.world_size > 1 and not self.args.use_hybrid_parallel: model = paddle.DataParallel(model) assert self.args.tensor_parallel_degree < 2, "tensor_parallel_degree = {}, pelease init optimizer.".format( - self.args.tensor_parallel_degree) + self.args.tensor_parallel_degree + ) + in_pipeline_parallel_mode = self.args.pipeline_parallel_degree > 1 in_sharding_parallel_mode = self.sharding is not None in_tensor_parallel_model = self.args.tensor_parallel_degree > 1 if in_pipeline_parallel_mode: if self.args.amp_master_grad: - mix_precision_utils.MixPrecisionLayer( - model, dtype=self.amp_dtype) # return value has no use + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use # hack for pipeline model mini batch to batch # need batter solution @ZHUI # make batch_fn compatible for fleet.distributed_model decorate. 
prepare_pipeline_inputs_func = ( - model._prepare_pipeline_inputs_func - if hasattr(model, "_prepare_pipeline_inputs_func") else None) + model._prepare_pipeline_inputs_func if hasattr(model, "_prepare_pipeline_inputs_func") else None + ) model = fleet.distributed_model(model) if prepare_pipeline_inputs_func is not None: model._prepare_pipeline_inputs_func = prepare_pipeline_inputs_func else: def _prepare_pipeline_inputs_func(inputs): - first_stage_keys = [ - "input_ids", "attention_mask", "position_ids" - ] + first_stage_keys = ["input_ids", "attention_mask", "position_ids"] last_stage_keys = ["labels"] def get_expected_keys(inputs, keys): - ret = tuple( - [inputs.pop(k) for k in keys if k in inputs]) + ret = tuple([inputs.pop(k) for k in keys if k in inputs]) if len(ret) == 1: ret = ret[0] return ret @@ -245,10 +235,7 @@ def get_expected_keys(inputs, keys): ] keys = list(inputs[0].keys()) - inputs_batch = { - key: [data.pop(key) for data in inputs] - for key in keys - } + inputs_batch = {key: [data.pop(key) for data in inputs] for key in keys} return [ get_expected_keys(inputs_batch, first_stage_keys), get_expected_keys(inputs_batch, last_stage_keys), @@ -261,8 +248,7 @@ def get_expected_keys(inputs, keys): assert self.optimizer is not None, "Pipeline mode need decorate optimizer, pelease init optimizer." if self.args.amp_master_grad: - self.optimizer = mix_precision_utils.MixPrecisionOptimizer( - self.optimizer) + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) self.optimizer = fleet.distributed_optimizer(self.optimizer) # No pipeline mode, sharding only @@ -271,19 +257,17 @@ def get_expected_keys(inputs, keys): if self.args.tensor_parallel_degree > 1: hcg = fleet.get_hybrid_communicate_group() assert ( - ShardingOption.SHARD_GRAD_OP in self.args.sharding or - ShardingOption.SHARD_OP in self.args.sharding + ShardingOption.SHARD_GRAD_OP in self.args.sharding or ShardingOption.SHARD_OP in self.args.sharding ), "Only support tensor parallel + sharding stage1/stage2 hybrid parallel now." - model = paddle.distributed.fleet.meta_parallel.TensorParallel( - model, hcg, strategy=None) + model = paddle.distributed.fleet.meta_parallel.TensorParallel(model, hcg, strategy=None) if ShardingOption.SHARD_OP in self.args.sharding: model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) else: # sync params (broadcast) buffers in dp group - if not is_dp_group_support_in_group_sharded_parallel( - ) and self.args.data_parallel_degree > 1: + + if not is_dp_group_support_in_group_sharded_parallel() and self.args.data_parallel_degree > 1: try: from paddle.fluid.dygraph.parallel import sync_params_buffers except ImportError: @@ -292,8 +276,7 @@ def get_expected_keys(inputs, keys): hcg = fleet.get_hybrid_communicate_group() dp_group = hcg.get_data_parallel_group() - sync_params_buffers( - model, comm_group=dp_group, src_rank=dp_group.ranks[0]) + sync_params_buffers(model, comm_group=dp_group, src_rank=dp_group.ranks[0]) cpu_offload = ShardingOption.OFFLOAD in self.args.sharding assert self.optimizer is not None, "optimizer is empty!" @@ -319,20 +302,19 @@ def get_expected_keys(inputs, keys): scaler=None, group=self.sharding_group, offload=cpu_offload, - **extra_kwargs, ) + **extra_kwargs, + ) self.optimizer = optimizer # pure tesnor parallel mode, no pipeline_parallel, no sharding. 
if not in_pipeline_parallel_mode and not in_sharding_parallel_mode and in_tensor_parallel_model: if self.args.amp_master_grad: - mix_precision_utils.MixPrecisionLayer( - model, dtype=self.amp_dtype) # return value has no use + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use model = fleet.distributed_model(model) assert self.optimizer is not None, "Tensor parallel mode need decorate optimizer, pelease init optimizer." if self.args.amp_master_grad: - self.optimizer = mix_precision_utils.MixPrecisionOptimizer( - self.optimizer) + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) self.optimizer = fleet.distributed_optimizer(self.optimizer) return model @@ -342,18 +324,21 @@ def autocast_smart_context_manager(self): arguments, depending on the situation. """ if self.enable_autocast_context_manager: - ctx_manager = autocast(True, ) + ctx_manager = autocast( + True, + ) else: - ctx_manager = contextlib.nullcontext() if sys.version_info >= ( - 3, 7) else contextlib.suppress() + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager - def evaluate(self, - eval_dataset: Optional[Dataset]=None, - ignore_keys: Optional[List[str]]=None, - metric_key_prefix: str="eval", - task_name="coco_caption") -> Dict[str, float]: + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + task_name="coco_caption", + ) -> Dict[str, float]: """ Run evaluation and returns metrics. @@ -382,7 +367,7 @@ def evaluate(self, self._memory_tracker.start() self.task_name = task_name if isinstance(eval_dataset, dict): - eval_dataset = eval_dataset['test'] + eval_dataset = eval_dataset["test"] eval_dataloader = self.get_eval_dataloader(eval_dataset) start_time = time.time() @@ -390,30 +375,34 @@ def evaluate(self, eval_dataloader, description="Evaluation", ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, ) + metric_key_prefix=metric_key_prefix, + ) - total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size - output.metrics.update(speed_metrics( - metric_key_prefix, - start_time, )) + # total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + ) + ) self.log(output.metrics) - self.control = self.callback_handler.on_evaluate( - self.args, self.state, self.control, output.metrics) + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) self._memory_tracker.stop_and_update_metrics(output.metrics) return output.metrics def evaluation_loop( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool]=None, - ignore_keys: Optional[List[str]]=None, - metric_key_prefix: str="eval", - max_eval_iters: Optional[int]=-1, ) -> EvalLoopOutput: + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_eval_iters: Optional[int] = -1, + ) -> EvalLoopOutput: """ Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. 
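Note on the fallback _prepare_pipeline_inputs_func reformatted above: it turns a list of per-sample dicts into one dict of lists, then splits the keys between the first and last pipeline stages. Roughly, as a standalone sketch of the same regrouping:

def regroup_micro_batch(samples, first_stage_keys, last_stage_keys):
    # samples: list of dicts, e.g. [{"input_ids": ..., "labels": ...}, ...]
    keys = list(samples[0].keys())
    batch = {key: [sample.pop(key) for sample in samples] for key in keys}

    def take(wanted):
        ret = tuple(batch.pop(k) for k in wanted if k in batch)
        return ret[0] if len(ret) == 1 else ret

    return [take(first_stage_keys), take(last_stage_keys)]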
@@ -427,9 +416,7 @@ def evaluation_loop( if isinstance(dataloader, paddle.io.DataLoader): batch_size = dataloader.batch_sampler.batch_size - elif isinstance( - dataloader, - paddle.fluid.dataloader.dataloader_iter._DataLoaderIterBase): + elif isinstance(dataloader, paddle.fluid.dataloader.dataloader_iter._DataLoaderIterBase): # support for inner dataloader batch_size = dataloader._batch_sampler.batch_size # alias for inner dataloader @@ -450,8 +437,7 @@ def evaluation_loop( logger.info(f" Total prediction steps = {max_eval_iters}") logger.info(f" Pre device batch size = {batch_size}") - logger.info( - f" Total Batch size = {batch_size * self.args.dataset_world_size}") + logger.info(f" Total Batch size = {batch_size * self.args.dataset_world_size}") model.eval() @@ -464,8 +450,7 @@ def evaluation_loop( # Prediction step eval_output = self.prediction_step(model, inputs) results.extend(eval_output) - self.control = self.callback_handler.on_prediction_step( - args, self.state, self.control) + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) if max_eval_iters > 0 and step >= max_eval_iters - 1: break if results is not None: @@ -473,15 +458,13 @@ def evaluation_loop( else: metrics = None - return EvalLoopOutput( - predictions=None, label_ids=None, metrics=metrics, num_samples=None) + return EvalLoopOutput(predictions=None, label_ids=None, metrics=metrics, num_samples=None) def prediction_step( - self, - model: nn.Layer, - inputs: Dict[str, Union[paddle.Tensor, Any]], ) -> Tuple[Optional[ - paddle.Tensor], Optional[paddle.Tensor], Optional[ - paddle.Tensor]]: + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. 
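For orientation, evaluation_loop above reduces to a simple shape: detect the batch size, put the model in eval mode, run prediction_step over the dataloader, collect the raw results, and stop early when max_eval_iters is set; metrics are only computed afterwards in after_evaluation. A compressed sketch of that control flow (not the trainer API itself):

def run_eval_loop(model, dataloader, prediction_step, max_eval_iters=-1):
    model.eval()
    results = []
    for step, batch in enumerate(dataloader):
        results.extend(prediction_step(model, batch))  # list of dicts per batch
        if max_eval_iters > 0 and step >= max_eval_iters - 1:
            break
    return results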
@@ -505,27 +488,23 @@ def prediction_step( with paddle.no_grad(): # with paddle.amp.auto_cast(level='O2'): model_inputs = self.eval_processor( - text=[""] * inputs['pixel_values'].shape[0], + text=[""] * inputs["pixel_values"].shape[0], return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) model_inputs.update(inputs) generated_ids, scores = model.generate(**model_inputs) - generated_text = self.processor.batch_decode( - generated_ids, skip_special_tokens=True) + generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True) generated_text = [text.strip() for text in generated_text] - for caption, img_id in zip(generated_text, inputs['image_id']): - results.append({ - "caption": caption, - "image_id": int(img_id) - }) + for caption, img_id in zip(generated_text, inputs["image_id"]): + results.append({"caption": caption, "image_id": int(img_id)}) elif "vqa" in self.task_name: with paddle.no_grad(): # with paddle.amp.auto_cast(level='O2'): model_inputs = inputs generated_ids, scores = model.predict_answers(**model_inputs) - answers = self.processor.batch_decode( - generated_ids, skip_special_tokens=True) + answers = self.processor.batch_decode(generated_ids, skip_special_tokens=True) answers = [text.strip() for text in answers] question_id = inputs["question_id"] for answer, ques_id in zip(answers, question_id): @@ -540,21 +519,21 @@ def after_evaluation(self, val_result): eval_result_file = save_result( result=val_result, result_dir=self.args.output_dir + self.task_name + "/result", - filename="{}_epoch{}".format('eval', 'eval'), + filename="{}_epoch{}".format("eval", "eval"), remove_duplicate="image_id", - world_size=self.args.world_size) + world_size=self.args.world_size, + ) - metrics = self._report_metrics_caption( - eval_result_file=eval_result_file) + metrics = self._report_metrics_caption(eval_result_file=eval_result_file) elif "vqa" in self.task_name: eval_result_file = save_result( val_result, result_dir=self.args.output_dir + self.task_name + "/result", - filename="{}_epoch{}".format('eval', 'eval'), - remove_duplicate="question_id", ) + filename="{}_epoch{}".format("eval", "eval"), + remove_duplicate="question_id", + ) - metrics = self._report_metrics_vqa( - eval_result_file=eval_result_file) + metrics = self._report_metrics_vqa(eval_result_file=eval_result_file) else: raise NotImplementedError return metrics @@ -562,7 +541,7 @@ def after_evaluation(self, val_result): def _report_metrics_caption(self, eval_result_file, split_name="test"): # TODO better way to define this - coco_gt_root = os.path.join('/root/.paddlemix/datasets/', "coco_gt") + coco_gt_root = os.path.join("/root/.paddlemix/datasets/", "coco_gt") coco_val = coco_caption_eval(coco_gt_root, eval_result_file, split_name) agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"] @@ -579,12 +558,11 @@ def _report_metrics_caption(self, eval_result_file, split_name="test"): def _report_metrics_vqa(self, eval_result_file): metrics = {} - self.anno_files = '/root/.paddlemix/datasets/coco/annotations/v2_mscoco_val2014_annotations.json' - self.ques_files = '/root/.paddlemix/datasets/coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json' + self.anno_files = "/root/.paddlemix/datasets/coco/annotations/v2_mscoco_val2014_annotations.json" + self.ques_files = "/root/.paddlemix/datasets/coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json" vqa = VQA(self.anno_files, self.ques_files) - vqa_result = vqa.loadRes( - resFile=eval_result_file, quesFile=self.ques_files) 
+ vqa_result = vqa.loadRes(resFile=eval_result_file, quesFile=self.ques_files) vqa_scorer = VQAEval(vqa, vqa_result, n=2) logger.info("Start VQA evaluation.") vqa_scorer.evaluate() @@ -597,12 +575,10 @@ def _report_metrics_vqa(self, eval_result_file): logger.info("Per Answer Type Accuracy is the following:") for ans_type in vqa_scorer.accuracy["perAnswerType"]: - logger.info( - "%s : %.02f" % - (ans_type, vqa_scorer.accuracy["perAnswerType"][ans_type])) + logger.info("%s : %.02f" % (ans_type, vqa_scorer.accuracy["perAnswerType"][ans_type])) metrics[ans_type] = vqa_scorer.accuracy["perAnswerType"][ans_type] with open(os.path.join(self.args.output_dir, "evaluate.txt"), "a") as f: f.write(json.dumps(metrics) + "\n") - return metrics \ No newline at end of file + return metrics diff --git a/paddlemix/trainer/trainer.py b/paddlemix/trainer/trainer.py index 8b2e44bd86de8..25570303f64bc 100644 --- a/paddlemix/trainer/trainer.py +++ b/paddlemix/trainer/trainer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np + import paddle from paddle.io import DataLoader from paddlenlp.trainer.trainer import Trainer @@ -80,10 +80,8 @@ def training_step(self, model, inputs) -> paddle.Tensor: if self.rank == 0 and self.args.tensorboard: self.logstep += 1 self.writer.add_scalar("train/loss", loss.item(), self.logstep) - self.writer.add_scalar("train/grad_norm", - grad_norms.item(), self.logstep) - self.writer.add_scalar("train/logit_scale", - logit_scale.item(), self.logstep) + self.writer.add_scalar("train/grad_norm", grad_norms.item(), self.logstep) + self.writer.add_scalar("train/logit_scale", logit_scale.item(), self.logstep) return loss.detach() @@ -103,4 +101,5 @@ def get_train_dataloader(self): collate_fn=self.data_collator, num_workers=self.args.dataloader_num_workers, prefetch_factor=1, - shuffle=False, ) + shuffle=False, + ) diff --git a/paddlemix/utils/downloader.py b/paddlemix/utils/downloader.py index 08f4139832d1c..f1659ec9225fa 100644 --- a/paddlemix/utils/downloader.py +++ b/paddlemix/utils/downloader.py @@ -30,15 +30,18 @@ from huggingface_hub.utils import EntryNotFoundError from tqdm.auto import tqdm -from .env import (DOWNLOAD_SERVER, FAILED_STATUS, HF_CACHE_HOME, MODEL_HOME, - SUCCESS_STATUS) +from .env import ( + DOWNLOAD_SERVER, + FAILED_STATUS, + HF_CACHE_HOME, + MODEL_HOME, + SUCCESS_STATUS, +) from .log import logger __all__ = ["get_weights_path_from_url", "resolve_cache_dir"] -COMMUNITY_MODEL_PREFIX = os.getenv( - "COMMUNITY_MODEL_PREFIX", - "https://bj.bcebos.com/paddlenlp/models/community") +COMMUNITY_MODEL_PREFIX = os.getenv("COMMUNITY_MODEL_PREFIX", "https://bj.bcebos.com/paddlenlp/models/community") WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") DOWNLOAD_RETRY_LIMIT = 3 DOWNLOAD_CHECK = False @@ -111,11 +114,12 @@ def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): def get_path_from_url_with_filelock( - url: str, - root_dir: str, - md5sum: Optional[str]=None, - check_exist: bool=True, - timeout: float=-1, ) -> str: + url: str, + root_dir: str, + md5sum: Optional[str] = None, + check_exist: bool = True, + timeout: float = -1, +) -> str: """construct `get_path_from_url` for `model_utils` to enable downloading multiprocess-safe Args: @@ -140,8 +144,7 @@ def get_path_from_url_with_filelock( os.makedirs(os.path.dirname(lock_file_path), exist_ok=True) with FileLock(lock_file_path, 
timeout=timeout): - result = get_path_from_url( - url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist) + result = get_path_from_url(url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist) return result @@ -161,15 +164,13 @@ def _download(url, path, md5sum=None): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 else: - raise RuntimeError("Download from {} failed. " - "Retry limit reached".format(url)) + raise RuntimeError("Download from {} failed. " "Retry limit reached".format(url)) logger.info("Downloading {} from {}".format(fname, url)) req = requests.get(url, stream=True) if req.status_code != 200: - raise RuntimeError("Downloading from {} failed with code " - "{}!".format(url, req.status_code)) + raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) # For protecting download interupted, download to # tmp_fullname firstly, move tmp_fullname to fullname @@ -178,11 +179,7 @@ def _download(url, path, md5sum=None): total_size = req.headers.get("content-length") with open(tmp_fullname, "wb") as f: if total_size: - with tqdm( - total=int(total_size), - unit="B", - unit_scale=True, - unit_divisor=1024) as pbar: + with tqdm(total=int(total_size), unit="B", unit_scale=True, unit_divisor=1024) as pbar: for chunk in req.iter_content(chunk_size=1024): f.write(chunk) pbar.update(len(chunk)) @@ -207,8 +204,7 @@ def _md5check(fullname, md5sum=None): calc_md5sum = md5.hexdigest() if calc_md5sum != md5sum: - logger.info("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) + logger.info("File {} md5 check failed, {}(calc) != " "{}(base)".format(fullname, calc_md5sum, md5sum)) return False return True @@ -425,10 +421,11 @@ def url_file_exists(url: str) -> bool: def hf_file_exists( - repo_id: str, - filename: str, - token: Union[bool, str, None]=None, - subfolder: Optional[str]=None, ) -> bool: + repo_id: str, + filename: str, + token: Union[bool, str, None] = None, + subfolder: Optional[str] = None, +) -> bool: """Check whether the HF file exists Args: @@ -447,16 +444,18 @@ def hf_file_exists( try: _ = get_hf_file_metadata( url=url, - token=token, ) + token=token, + ) return True except EntryNotFoundError: return False def resolve_cache_dir( - pretrained_model_name_or_path: str, - from_hf_hub: bool, - cache_dir: Optional[str]=None, ) -> str: + pretrained_model_name_or_path: str, + from_hf_hub: bool, + cache_dir: Optional[str] = None, +) -> str: """resolve cache dir for PretrainedModel and PretrainedConfig Args: diff --git a/paddlemix/utils/env.py b/paddlemix/utils/env.py index 83a2f55c9cac0..0251734197348 100644 --- a/paddlemix/utils/env.py +++ b/paddlemix/utils/env.py @@ -39,9 +39,7 @@ def _get_ppmix_home(): if os.path.isdir(home_path): return home_path else: - raise RuntimeError( - "The environment variable PPMIX_HOME {} is not a directory.". 
- format(home_path)) + raise RuntimeError("The environment variable PPMIX_HOME {} is not a directory.".format(home_path)) else: return home_path return os.path.join(_get_user_home(), ".paddlemix") @@ -108,8 +106,8 @@ def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): def setdistenv(args): if dist.get_world_size() > 1: args.dp_degree = dist.get_world_size() // ( - args.tensor_parallel_degree * args.sharding_parallel_degree * - args.pipeline_parallel_degree) + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { "dp_degree": args.dp_degree, @@ -131,10 +129,10 @@ def setdistenv(args): args.dp_rank = hcg.get_data_parallel_rank() args.sharding_rank = hcg.get_sharding_parallel_rank() - args.data_world_rank = ( - args.dp_rank * args.sharding_parallel_degree + args.sharding_rank) + args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.tensor_parallel_degree * args.pipeline_parallel_degree + ) else: args.data_world_rank = 0 args.data_world_size = 1 diff --git a/paddlemix/utils/initializer.py b/paddlemix/utils/initializer.py index 82777c8ad1f88..8a13c739977bb 100644 --- a/paddlemix/utils/initializer.py +++ b/paddlemix/utils/initializer.py @@ -136,9 +136,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False): Tuple[fan_in, fan_out] """ if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if reverse: num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] @@ -191,8 +189,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False): mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) @@ -218,13 +215,11 @@ def _calculate_gain(nonlinearity, param=None): elif nonlinearity == "leaky_relu": if param is None: negative_slope = 0.01 - elif (not isinstance(param, bool) and isinstance(param, int) or - isinstance(param, float)): + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): # True/False are instances of int, hence check above negative_slope = param else: - raise ValueError("negative_slope {} not a valid number".format( - param)) + raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope**2)) elif nonlinearity == "selu": return 3.0 / 4 @@ -232,11 +227,7 @@ def _calculate_gain(nonlinearity, param=None): raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) -def kaiming_uniform_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_uniform method Args: @@ -254,11 +245,7 @@ def kaiming_uniform_(tensor, return _no_grad_uniform_(tensor, -k, k) -def kaiming_normal_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_normal_(tensor, a=0, 
mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_normal_ Args: @@ -306,8 +293,7 @@ def reset_initialized_parameter(model, include_self=True): """ for _, m in model.named_sublayers(include_self=include_self): if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * - m._kernel_size[1]) + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) k = math.sqrt(k) _no_grad_uniform_(m.weight, -k, k) if hasattr(m, "bias") and getattr(m, "bias") is not None: @@ -343,13 +329,11 @@ def _transform(t, device, dtype, blocking): size_dtype = core.size_of_dtype(dtype) # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. - waiting_alloc_memory = ( - (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + waiting_alloc_memory = ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 gpu_memory_available = core.gpu_memory_available() if gpu_memory_available < waiting_alloc_memory: # Copy param / Tensor to cpu - t_used = t._copy_to(paddle.CPUPlace(), - blocking) # k-v type will error + t_used = t._copy_to(paddle.CPUPlace(), blocking) # k-v type will error # Release mem of t t.value().get_tensor()._clear() else: @@ -379,11 +363,12 @@ def _transform(t, device, dtype, blocking): def to( - self, - device=None, - dtype=None, - blocking=None, - floating_only=True, ): + self, + device=None, + dtype=None, + blocking=None, + floating_only=True, +): """ Cast the parameters and buffers of Layer by the give device, dtype and blocking. @@ -411,24 +396,25 @@ def to( if isinstance(device, str): device = paddle.device._convert_to_place(device) elif isinstance( - device, + device, ( core.CPUPlace, core.CUDAPlace, core.CUDAPinnedPlace, - core.XPUPlace, ), ): + core.XPUPlace, + ), + ): pass else: raise ValueError( "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " - + type(device).__name__) + + type(device).__name__ + ) if blocking is None: blocking = True else: - assert isinstance( - blocking, - bool), "blocking value error, must be the True, False or None" + assert isinstance(blocking, bool), "blocking value error, must be the True, False or None" def transform(t, device, dtype, blocking): if floating_only and (not paddle.is_floating_point(t)): diff --git a/paddlemix/utils/log.py b/paddlemix/utils/log.py index b951c826b6321..97f7a407c70dd 100644 --- a/paddlemix/utils/log.py +++ b/paddlemix/utils/log.py @@ -23,34 +23,13 @@ loggers = {} log_config = { - "DEBUG": { - "level": 10, - "color": "purple" - }, - "INFO": { - "level": 20, - "color": "green" - }, - "TRAIN": { - "level": 21, - "color": "cyan" - }, - "EVAL": { - "level": 22, - "color": "blue" - }, - "WARNING": { - "level": 30, - "color": "yellow" - }, - "ERROR": { - "level": 40, - "color": "red" - }, - "CRITICAL": { - "level": 50, - "color": "bold_red" - }, + "DEBUG": {"level": 10, "color": "purple"}, + "INFO": {"level": 20, "color": "green"}, + "TRAIN": {"level": 21, "color": "cyan"}, + "EVAL": {"level": 22, "color": "blue"}, + "WARNING": {"level": 30, "color": "yellow"}, + "ERROR": {"level": 40, "color": "red"}, + "CRITICAL": {"level": 50, "color": "bold_red"}, } @@ -62,22 +41,19 @@ class Logger(object): name(str) : Logger name, default is 'PaddleNLP' """ - def __init__(self, name: 
str=None): + def __init__(self, name: str = None): name = "PaddleMIX" if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): logging.addLevelName(conf["level"], key) self.__dict__[key] = functools.partial(self.__call__, conf["level"]) - self.__dict__[key.lower()] = functools.partial(self.__call__, - conf["level"]) + self.__dict__[key.lower()] = functools.partial(self.__call__, conf["level"]) self.format = colorlog.ColoredFormatter( "%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s", - log_colors={ - key: conf["color"] - for key, conf in log_config.items() - }, ) + log_colors={key: conf["color"] for key, conf in log_config.items()}, + ) self.handler = logging.StreamHandler() self.handler.setFormatter(self.format) @@ -95,8 +71,7 @@ def enable(self): self._is_enable = True def set_level(self, log_level: str): - assert (log_level in log_config - ), f"Invalid log level. Choose among {log_config.keys()}" + assert log_level in log_config, f"Invalid log level. Choose among {log_config.keys()}" self.logger.setLevel(log_level) @property @@ -117,7 +92,7 @@ def use_terminator(self, terminator: str): self.handler.terminator = old_terminator @contextlib.contextmanager - def processing(self, msg: str, interval: float=0.1): + def processing(self, msg: str, interval: float = 0.1): """ Continuously print a progress bar with rotating special effects. diff --git a/paddlemix/utils/parameters.py b/paddlemix/utils/parameters.py index 1e34840804b4c..e6a68aa0010db 100644 --- a/paddlemix/utils/parameters.py +++ b/paddlemix/utils/parameters.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import paddle @@ -19,20 +20,19 @@ def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): param_shape = p.shape # Allow CPU/GPU and float16/float32 transfer # NOTE: str(p.place) differs between paddle develop and 2.2 - if str(p.dtype)[-len(dtype):] == dtype and ("gpu" in str(p.place).lower() or - "cuda" in str(p.place).lower()): + if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): return p if restore_data: - if (getattr(paddle.fluid.framework, "_in_eager_mode_", False) and - getattr(paddle.fluid.framework, "_dygraph_tracer_", None) is - not None) or (hasattr(paddle.fluid.framework, "global_var") and - getattr(paddle.fluid.framework.global_var, - "_in_eager_mode_", False) and - getattr(paddle.fluid.framework.global_var, - "_dygraph_tracer_", None) is not None): + if ( + getattr(paddle.fluid.framework, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework, "_dygraph_tracer_", None) is not None + ) or ( + hasattr(paddle.fluid.framework, "global_var") + and getattr(paddle.fluid.framework.global_var, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework.global_var, "_dygraph_tracer_", None) is not None + ): param_data = p.numpy() - new_p = paddle.create_parameter( - shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p = paddle.create_parameter(shape=param_shape, dtype=dtype, is_bias=is_bias) new_p.set_value(param_data.astype(dtype)) return new_p elif paddle.in_dynamic_mode(): @@ -42,16 +42,13 @@ def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): # elaborately to get a ParamBase. Also note `VarBase.set_value` # enforce the same dtype and can not be used directly. 
new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) - new_p.value().get_tensor().set( - param_data.astype(dtype), - paddle.framework._current_expected_place()) + new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) return new_p else: - param_data = np.array(paddle.static.global_scope().find_var(p.name) - .get_tensor()) + param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) return paddle.create_parameter( shape=param_shape, dtype=dtype, is_bias=is_bias, - default_initializer=paddle.nn.initializer.Assign(param_data) - if restore_data else None, ) + default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, + ) diff --git a/paddlevlp/datasets/dataset.py b/paddlevlp/datasets/dataset.py deleted file mode 100644 index 96452fb68de78..0000000000000 --- a/paddlevlp/datasets/dataset.py +++ /dev/null @@ -1,1136 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import atexit -import inspect -import os -import time -import warnings -from collections import namedtuple -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast - -import datasets -from multiprocess import Pool, RLock -from PIL import Image - -import paddlemix - -try: - import paddle.distributed as dist -except Exception: - warnings.warn("paddle.distributed is not contains in you paddle!") - -import importlib -from functools import partial - -from paddle.io import Dataset, IterableDataset -from paddle.utils.download import _get_unique_endpoints - -from paddlemix.utils.env import DATA_HOME - -__all__ = ["MapDataset", "DatasetBuilder", "IterDataset", "load_dataset"] - -DATASETS_MODULE_PATH = "paddlemix.datasets." 
- -# Patch for intranet -from datasets import load_dataset as origin_load_dataset # noqa: E402 - - -def load_from_ppvlp(path, *args, **kwargs): - ppvlp_path = paddlemix.datasets.__path__[0] - new_path = os.path.split(path)[-1] - new_path = os.path.join(ppvlp_path, "hf_datasets", new_path + ".py") - if os.path.exists(new_path): - return origin_load_dataset(new_path, *args, **kwargs) - else: - return origin_load_dataset(path, *args, **kwargs) - - -datasets.load_dataset = load_from_ppvlp - - -class DatasetTuple: - def __init__(self, splits): - self.identifier_map, identifiers = self._gen_identifier_map(splits) - self.tuple_cls = namedtuple("datasets", identifiers) - self.tuple = self.tuple_cls(* [None for _ in splits]) - - def __getitem__(self, key): - if isinstance(key, (int, slice)): - return self.tuple[key] - if isinstance(key, str): - return getattr(self.tuple, self.identifier_map[key]) - - def __setitem__(self, key, value): - self.tuple = self.tuple._replace(**{self.identifier_map[key]: value}) - - def _gen_identifier_map(self, splits): - identifier_map = {} - identifiers = [] - for i in range(len(splits)): - identifiers.append("splits_" + str(i)) - identifier_map[splits[i]] = "splits_" + str(i) - return identifier_map, identifiers - - def __len__(self): - return len(self.tuple) - - -def import_main_class(module_path): - """ - Import a module at module_path and return its DatasetBuilder class. - - """ - module_path = DATASETS_MODULE_PATH + module_path - module = importlib.import_module(module_path) - main_cls_type = DatasetBuilder - - # Find the main class in our imported module - module_main_cls = None - for name, obj in module.__dict__.items(): - if isinstance(obj, type) and issubclass(obj, main_cls_type): - if name == "DatasetBuilder": - continue - module_main_cls = obj - break - - return module_main_cls - - -def load_from_hf(path, name=None, splits=None, **kwargs): - from datasets import DatasetDict - from datasets import load_dataset as load_hf_dataset - from datasets.features import ClassLabel - - try: - hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) - except FileNotFoundError: - raise FileNotFoundError("Couldn't find the dataset script for '" + path - + "' on PaddleNLP or HuggingFace") - else: - label_list = [] - if isinstance(hf_datasets, DatasetDict): - datasets = DatasetTuple(list(hf_datasets.keys())) - for split, ds in hf_datasets.items(): - for feature in ds.features.values(): - if isinstance(feature, ClassLabel): - label_list = feature.names - datasets[split] = MapDataset(ds, label_list=label_list) - elif isinstance(hf_datasets, list): - datasets = DatasetTuple(splits) - for i, split in enumerate(splits): - for feature in hf_datasets[i].features.values(): - if isinstance(feature, ClassLabel): - label_list = feature.names - datasets[split] = MapDataset( - hf_datasets[i], label_list=label_list) - else: - for feature in hf_datasets.features.values(): - if isinstance(feature, ClassLabel): - label_list = feature.names - datasets = MapDataset(hf_datasets, label_list=label_list) - return datasets - - -def load_dataset(path_or_read_func, - name=None, - data_files=None, - splits=None, - lazy=None, - **kwargs): - """ - This method will load a dataset, either form PaddleNLP library or from a - self-defined data loading script, by calling functions in `DatasetBuilder`. - - For all the names of datasets in PaddleNLP library, see here: `dataset_list - `__. - - Either `splits` or `data_files` must be specified. 
- - Args: - path_or_read_func (str|callable): Name of the dataset processing script - in PaddleNLP library or a custom data reading function. - name (str, optional): Additional name to select a more specific dataset. - Defaults to None. - data_files (str|list|tuple|dict, optional): Defining the path of dataset - files. If None. `splits` must be specified. Defaults to None. - splits (str|list|tuple, optional): Which split of the data to load. If None. - `data_files` must be specified. Defaults to None. - lazy (bool, optional): Weather to return `MapDataset` or an `IterDataset`. - True for `IterDataset`. False for `MapDataset`. If None, return the - default type of this dataset. Defaults to None. - kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`. - - Returns: - A `MapDataset` or `IterDataset` or a tuple of those. - - For how to use this function, please see `dataset_load - `__ - and `dataset_self_defined - `__ - - """ - if inspect.isfunction(path_or_read_func): - assert lazy is not None, "lazy can not be None in custom mode." - kwargs["name"] = name - kwargs["data_files"] = data_files - kwargs["splits"] = splits - custom_kwargs = {} - for name in inspect.signature(path_or_read_func).parameters.keys(): - if name in kwargs.keys(): - custom_kwargs[name] = kwargs[name] - - reader_instance = SimpleBuilder(lazy=lazy, read_func=path_or_read_func) - return reader_instance.read(**custom_kwargs) - else: - try: - reader_cls = import_main_class(path_or_read_func) - except ModuleNotFoundError: - datasets = load_from_hf( - path_or_read_func, name=name, splits=splits, **kwargs) - else: - reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) - - # Check if selected name and split is valid in this DatasetBuilder - if hasattr(reader_instance, "BUILDER_CONFIGS"): - if name in reader_cls.BUILDER_CONFIGS.keys(): - split_names = reader_cls.BUILDER_CONFIGS[name][ - "splits"].keys() - else: - raise ValueError( - 'Invalid name "{}". Should be one of {}.'.format( - name, list(reader_cls.BUILDER_CONFIGS.keys()))) - elif hasattr(reader_instance, "SPLITS"): - split_names = reader_instance.SPLITS.keys() - else: - raise AttributeError( - "Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder." - ) - - selected_splits = [] - if isinstance(splits, list) or isinstance(splits, tuple): - selected_splits.extend(splits) - else: - selected_splits += [splits] - - for split_name in selected_splits: - if split_name not in split_names and split_name is not None: - raise ValueError('Invalid split "{}". Should be one of {}.'. - format(split_name, list(split_names))) - - datasets = reader_instance.read_datasets( - data_files=data_files, splits=splits) - return datasets - - -class MapDataset(Dataset): - """ - Wraps a map-style dataset-like object as an instance of `MapDataset`, and equips it - with `map` and other utility methods. All non-magic methods of the raw object - are also accessible. - - Args: - data (list|Dataset): An object with `__getitem__` and `__len__` methods. It could - be a list or a subclass of `paddle.io.Dataset`. - kwargs (dict, optional): Other information to be passed to the dataset. - - For examples of this class, please see `dataset_self_defined - `__. 
- - """ - - def __init__(self, data, **kwargs): - self.data = data - self._transform_pipline = [] - self.new_data = self.data - self.info = kwargs - self.label_list = self.info.pop("label_list", None) - self.vocab_info = self.info.pop("vocab_info", None) - - def _transform(self, data): - for fn in self._transform_pipline: - data = fn(data) - return data - - def __getitem__(self, idx): - """ - Basic function of `MapDataset` to get sample from dataset with a given - index. - """ - return (self._transform(self.new_data[idx]) - if self._transform_pipline else self.new_data[idx]) - - def __len__(self): - """ - Returns the number of samples in dataset. - """ - return len(self.new_data) - - def filter(self, fn, num_workers=0): - """ - Filters samples by the filter function and uses the filtered data to - update this dataset. - - Args: - fn (callable): A filter function that takes a sample as input and - returns a boolean. Samples that return False would be discarded. - num_workers(int, optional): Number of processes for multiprocessing. If - set to 0, it doesn't use multiprocessing. Defaults to `0`. - """ - assert num_workers >= 0, "num_workers should be a non-negative value" - if num_workers > 1: - shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) - ] - kwds_per_shard = [ - dict( - self=shards[rank], fn=fn) for rank in range(num_workers) - ] - pool = Pool(num_workers, initargs=(RLock(), )) - - results = [ - pool.apply_async( - self.__class__._filter, kwds=kwds) - for kwds in kwds_per_shard - ] - transformed_shards = [r.get() for r in results] - - pool.close() - pool.join() - self.new_data = [] - for i in range(num_workers): - self.new_data += transformed_shards[i].new_data - return self - else: - return self._filter(fn) - - def _filter(self, fn): - self.new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if fn(self.new_data[idx]) - ] - return self - - def shard(self, num_shards=None, index=None, contiguous=False): - self.new_data = self._shard( - num_shards=num_shards, index=index, contiguous=contiguous).data - return self - - def _shard(self, num_shards=None, index=None, contiguous=False): - """ - Split the dataset into `num_shards` pieces. Note that the size of each - shard might be different because the original dataset may not be evenly - divisible. - - Args: - num_shards (int, optional): An integer representing the number of - data shards. If None, `num_shards` would be number of trainers. - Defaults to `None`. - index (int, optional): An integer representing the index of the - current shard. If None, `index` would be the current trainer rank - id. Defaults to `None`. - contiguous: (bool, optional): If true, contiguous chunks of data - will be select for sharding. And total number of examples will - be the same. Otherwise each shard will contain all examples of - dataset whose index mod `num_shards` = `index`. Defaults to `False`. 
- """ - if num_shards is None: - num_shards = dist.get_world_size() - if index is None: - index = dist.get_rank() - - if contiguous: - div = len(self) // num_shards - mod = len(self) % num_shards - start = div * index + min(index, mod) - end = start + div + (1 if index < mod else 0) - new_data = [self.new_data[idx] for idx in range(start, end)] - else: - new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if idx % num_shards == index - ] - - return MapDataset(new_data) - - def map(self, fn, lazy=True, batched=False, num_workers=0): - """ - Performs specific function on the dataset to transform and update every sample. - - Args: - fn (callable): Transformations to be performed. It receives single - sample as argument if batched is False. Else it receives all examples. - lazy (bool, optional): If True, transformations would be delayed and - performed on demand. Otherwise, transforms all samples at once. Note that - if `fn` is stochastic, `lazy` should be True or you will get the same - result on all epochs. Defaults to False. - batched(bool, optional): If True, transformations would take all examples as - input and return a collection of transformed examples. Note that if set - True, `lazy` option would be ignored. Defaults to False. - num_workers(int, optional): Number of processes for multiprocessing. If - set to 0, it doesn't use multiprocessing. Note that if set to positive - value, `lazy` option would be ignored. Defaults to 0. - """ - - assert num_workers >= 0, "num_workers should be a non-negative value" - if num_workers > 1: - shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) - ] - kwds_per_shard = [ - dict( - self=shards[rank], fn=fn, lazy=False, batched=batched) - for rank in range(num_workers) - ] - pool = Pool(num_workers, initargs=(RLock(), )) - results = [ - pool.apply_async( - self.__class__._map, kwds=kwds) for kwds in kwds_per_shard - ] - transformed_shards = [r.get() for r in results] - pool.close() - pool.join() - self.new_data = [] - for i in range(num_workers): - self.new_data += transformed_shards[i].new_data - return self - else: - return self._map(fn, lazy=lazy, batched=batched) - - def _map(self, fn, lazy=True, batched=False): - if batched: - self.new_data = fn(self.new_data) - elif lazy: - self._transform_pipline.append(fn) - else: - self.new_data = [ - fn(self.new_data[idx]) for idx in range(len(self.new_data)) - ] - return self - - -class IterDataset(IterableDataset): - """ - Wraps a dataset-like object as an instance of `IterDataset`, and equips it with - `map` and other utility methods. All non-magic methods of the raw object - also accessible. - - Args: - data (Iterable): An object with `__iter__` function. It can be a Iterable or a - subclass of `paddle.io.IterableDataset`. - kwargs (dict, optional): Other information to be passed to the dataset. - - For examples of this class, please see `dataset_self_defined - `__. - """ - - def __init__(self, data, **kwargs): - self.data = data - self._transform_pipline = [] - self._filter_pipline = [] - - self.label_list = kwargs.pop("label_list", None) - self.vocab_info = kwargs.pop("vocab_info", None) - - def _transform(self, data): - for fn in self._transform_pipline: - data = fn(data) - return data - - def _shard_filter(self, num_samples): - return True - - def _filter(self, data): - for fn in self._filter_pipline: - if not fn(data): - return False - return True - - def __iter__(self): - """ - yields sample sequentially. 
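A hedged sketch of the `map` semantics documented above, assuming the `MapDataset` class defined in this file; the toy examples are made up.

ds = MapDataset([{"text": "a b"}, {"text": "c d e"}])

# lazy=True only records the transform; it runs when an item is accessed.
ds.map(lambda ex: {**ex, "n_tokens": len(ex["text"].split())}, lazy=True)
print(ds[0])   # {'text': 'a b', 'n_tokens': 2}

# batched=True hands the whole example list to the function at once
# (and the `lazy` option is ignored in that case).
ds.map(lambda batch: [{**ex, "upper": ex["text"].upper()} for ex in batch],
       batched=True)
print(ds[1])   # the recorded lazy transform is still applied on access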
- """ - num_samples = 0 - if inspect.isfunction(self.data): - for example in self.data(): - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example - num_samples += 1 - else: - if inspect.isgenerator(self.data): - warnings.warn( - "Reciving generator as data source, data can only be iterated once" - ) - for example in self.data: - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example - num_samples += 1 - - def filter(self, fn): - """ - Filters samples by the filter function and uses the filtered data to - update this dataset. - - Args: - fn (callable): A filter function that takes a sample as input and - returns a boolean. Samples that return False are discarded. - """ - - self._filter_pipline.append(fn) - - return self - - def shard(self, num_shards=None, index=None): - """ - Split the dataset into `num_shards` pieces. - - Args: - num_shards (int, optional): An integer representing the number of - data shards. If None, `num_shards` would be number of trainers. - Defaults to None. - index (int, optional): An integer representing the index of the - current shard. If None, `index` would be the current trainer rank - id. Defaults to None. - """ - if num_shards is None: - num_shards = dist.get_world_size() - if index is None: - index = dist.get_rank() - - def sharder(num_shards, index, num_samples): - if num_samples % num_shards == index: - return True - else: - return False - - fn = partial(sharder, num_shards=num_shards, index=index) - self._shard_filter = fn - return self - - def map(self, fn): - """ - Performs specific function on the dataset to transform and update every sample. - - Args: - fn (callable): Transformations to be performed. It receives single - sample as argument. - """ - - self._transform_pipline.append(fn) - - return self - - -class DatasetBuilder: - """ - A base class for all DatasetBuilder. It provides a `read()` function to turn - a data file into a MapDataset or IterDataset. - - `_get_data()` function and `_read()` function should be implemented to download - data file and read data file into a `Iterable` of the examples. - - For how to define a custom `DatasetBuilder`, please see `contribute_dataset - `__. - """ - - lazy = False - - def __init__(self, lazy=None, name=None, **config): - if lazy is not None: - self.lazy = lazy - self.name = name - self.config = config - - def read_datasets(self, splits=None, data_files=None): - def remove_if_exit(filepath): - if isinstance(filepath, (list, tuple)): - for file in filepath: - try: - os.remove(file) - except OSError: - pass - else: - try: - os.remove(filepath) - except OSError: - pass - - if data_files is None: - if splits is None: - splits = (list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) - if hasattr(self, "BUILDER_CONFIGS") else - list(self.SPLITS.keys())) - - assert ( - isinstance(splits, str) or - (isinstance(splits, list) and isinstance(splits[0], str)) or - (isinstance(splits, tuple) and isinstance(splits[0], str)) - ), "`splits` should be a string or list of string or a tuple of string." 
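A hedged sketch of the `DatasetBuilder` contract described above: `_get_data` resolves a split name to a file path and `_read` yields example dicts. The class name, file paths, and label names are illustrative assumptions.

class TinyClassification(DatasetBuilder):
    SPLITS = {"train": "train.tsv", "dev": "dev.tsv"}

    def _get_data(self, mode):
        # A real builder would download and cache the file here; this sketch
        # assumes the files already exist locally.
        return self.SPLITS[mode]

    def _read(self, filename, *args):
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                text, label = line.rstrip("\n").split("\t")
                yield {"text": text, "label": label}

    def get_labels(self):
        return ["negative", "positive"]

# train_ds = TinyClassification(lazy=False).read_datasets(splits="train")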
- - if isinstance(splits, str): - splits = [splits] - datasets = DatasetTuple(splits) - parallel_env = dist.ParallelEnv() - unique_endpoints = _get_unique_endpoints( - parallel_env.trainer_endpoints[:]) - # move register hook to first and register togather - lock_files = [] - for split in splits: - lock_file = os.path.join(DATA_HOME, self.__class__.__name__) - if self.name is not None: - lock_file = lock_file + "." + self.name - lock_file += "." + split + ".done" + "." + str(os.getppid()) - lock_files.append(lock_file) - # Must register to all procs to make the lock file can be removed - # when any proc breaks. Otherwise, the single registered proc may - # not receive proper singal send by the parent proc to exit. - atexit.register(lambda: remove_if_exit(lock_files)) - for split in splits: - filename = self._get_data(split) - lock_file = os.path.join(DATA_HOME, self.__class__.__name__) - if self.name is not None: - lock_file = lock_file + "." + self.name - lock_file += "." + split + ".done" + "." + str(os.getppid()) - # `lock_file` indicates the finished status of`_get_data`. - # `_get_data` only works in the `unique_endpoints` specified - # proc since `get_path_from_url` only work for it. The other - # procs wait `_get_data` to be finished. - if parallel_env.current_endpoint in unique_endpoints: - f = open(lock_file, "w") - f.close() - else: - while not os.path.exists(lock_file): - time.sleep(1) - datasets[split] = self.read(filename=filename, split=split) - else: - assert ( - isinstance(data_files, str) or isinstance(data_files, tuple) or - isinstance(data_files, list) - ), "`data_files` should be a string or tuple or list of strings." - if isinstance(data_files, str): - data_files = [data_files] - default_split = "train" - if splits: - if isinstance(splits, str): - splits = [splits] - datasets = DatasetTuple(splits) - assert len(splits) == len( - data_files - ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." - for i in range(len(data_files)): - datasets[splits[i]] = self.read( - filename=data_files[i], split=splits[i]) - else: - datasets = DatasetTuple( - ["split" + str(i) for i in range(len(data_files))]) - for i in range(len(data_files)): - datasets["split" + str(i)] = self.read( - filename=data_files[i], split=default_split) - - return datasets if len(datasets) > 1 else datasets[0] - - def read(self, filename, split="train"): - """ - Returns a dataset containing all the examples that can be read from the file path. - - If `self.lazy` is False, this eagerly reads all instances from `self._read()` - and returns a `MapDataset`. - - If `self.lazy` is True, this returns an `IterDataset`, which internally - relies on the generator created from `self._read()` to lazily produce examples. - In this case your implementation of `_read()` must also be lazy - (that is, not load all examples into memory at once). - - Args: - filename (str): Path of data file to read, usually provided by `_get_data` - function. - split (str, optional): The split name of selected dataset. This only makes - a different when data files of different splits have different structures. - - Returns: - A `MapDataset|IterDataset`. - """ - - label_list = self.get_labels() - vocab_info = self.get_vocab() - - def _create_dict(labels): - # For multiple labels in the form of list. 
- if isinstance(labels[0], list) or isinstance(labels[0], tuple): - label_dict = [] - for sub_labels in labels: - sub_dict = {} - for i, label in enumerate(sub_labels): - sub_dict[label] = i - label_dict.append(sub_dict) - else: - label_dict = {} - for i, label in enumerate(labels): - label_dict[label] = i - return label_dict - - def _convert_label_to_id(labels, label_dict): - if isinstance(labels, list) or isinstance(labels, tuple): - for label_idx in range(len(labels)): - labels[label_idx] = label_dict[labels[label_idx]] - else: - labels = label_dict[labels] - return labels - - if self.lazy: - - def generate_examples(): - generator = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) - for example in generator: - # We need to check if the example contains label column and confirm its name. - # For now we only allow `label` or `labels` to be the name of label column. - if "labels" in example.keys(): - label_col = "labels" - elif "label" in example.keys(): - label_col = "label" - else: - label_col = None - - # Convert class label to label ids. - if label_list is not None and example.get(label_col, None): - label_dict = _create_dict(label_list) - # For multiple labels in the form of list. - if isinstance(label_dict, list): - for idx, sub_dict in enumerate(label_dict): - example[label_col][idx] = _convert_label_to_id( - example[label_col][idx], sub_dict) - else: - example[label_col] = _convert_label_to_id( - example[label_col], label_dict) - - yield example - else: - yield example - - return IterDataset( - generate_examples(), - label_list=label_list, - vocab_info=vocab_info) - else: - examples = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) - - # Then some validation. - if not isinstance(examples, list): - examples = list(examples) - - if not examples: - raise ValueError( - "No instances were read from the given filepath {}. " - "Is the path correct?".format(filename)) - - # We need to check if the example contains label column and confirm its name. - # For now we only allow `label` or `labels` to be the name of label column. - if "labels" in examples[0].keys(): - label_col = "labels" - elif "label" in examples[0].keys(): - label_col = "label" - else: - label_col = None - - # Convert class label to label ids. - if label_list is not None and examples[0].get(label_col, None): - label_dict = _create_dict(label_list) - for idx in range(len(examples)): - # For multiple labels in the form of list. - if isinstance(label_dict, list): - for i, sub_dict in enumerate(label_dict): - examples[idx][label_col][i] = _convert_label_to_id( - examples[idx][label_col][i], sub_dict) - else: - examples[idx][label_col] = _convert_label_to_id( - examples[idx][label_col], label_dict) - - return MapDataset( - examples, label_list=label_list, vocab_info=vocab_info) - - def _read(self, filename: str, *args): - """ - Reads examples from the given file_path and returns them as an - `Iterable` (which could be a list or a generator). - - This method must be implemented in self-defined `DatasetBuilder`. - """ - raise NotImplementedError - - def _get_data(self, mode: str): - """ - Downloads examples from the given URL and customized split - informations and returns a filepath. - - This method must be implemented in self-defined `DatasetBuilder`. - """ - raise NotImplementedError - - def get_labels(self): - """ - Returns list of class labels of the dataset if specified. 
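A hedged, standalone illustration of the label-to-id conversion performed in `read` above; the label names and example are made up.

label_list = ["entailment", "neutral", "contradiction"]
label_dict = {label: i for i, label in enumerate(label_list)}   # what _create_dict builds

example = {"text": "premise ||| hypothesis", "label": "neutral"}
example["label"] = label_dict[example["label"]]                  # what _convert_label_to_id does
print(example)   # {'text': 'premise ||| hypothesis', 'label': 1}

# When a dataset defines several label fields as a list of label lists, one
# dict is built per sub-list and each position of `labels` is mapped with its
# own dict, as in the list branch above.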
- """ - return None - - def get_vocab(self): - """ - Returns vocab file path of the dataset if specified. - """ - return None - - -class SimpleBuilder(DatasetBuilder): - def __init__(self, lazy, read_func): - self._read = read_func - self.lazy = lazy - - def read(self, **kwargs): - if self.lazy: - - def generate_examples(): - generator = self._read(**kwargs) - for example in generator: - yield example - - return IterDataset(generate_examples) - else: - examples = self._read(**kwargs) - if hasattr(examples, "__len__") and hasattr(examples, - "__getitem__"): - return MapDataset(examples) - else: - return MapDataset(list(examples)) - - -def has_file_allowed_extension(filename: str, - extensions: Union[str, Tuple[str, ...]]) -> bool: - """Checks if a file is an allowed extension. - - Args: - filename (string): path to a file - extensions (tuple of strings): extensions to consider (lowercase) - - Returns: - bool: True if the filename ends with one of given extensions - """ - return filename.lower().endswith( - extensions if isinstance(extensions, str) else tuple(extensions)) - - -def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: - """Finds the class folders in a dataset. - - See :class:`DatasetFolder` for details. - """ - classes = sorted( - entry.name for entry in os.scandir(directory) if entry.is_dir()) - if not classes: - raise FileNotFoundError( - f"Couldn't find any class folder in {directory}.") - - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def make_dataset( - directory: str, - class_to_idx: Optional[Dict[str, int]]=None, - extensions: Optional[Union[str, Tuple[str, ...]]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[Tuple[ - str, int]]: - """Generates a list of samples of a form (path_to_sample, class). - - See :class:`DatasetFolder` for details. - - Note: The class_to_idx parameter is here optional and will use the logic of the ``find_classes`` function - by default. - """ - directory = os.path.expanduser(directory) - - if class_to_idx is None: - _, class_to_idx = find_classes(directory) - elif not class_to_idx: - raise ValueError( - "'class_to_index' must have at least one entry to collect any samples." - ) - - both_none = extensions is None and is_valid_file is None - both_something = extensions is not None and is_valid_file is not None - if both_none or both_something: - raise ValueError( - "Both extensions and is_valid_file cannot be None or not None at the same time" - ) - - if extensions is not None: - - def is_valid_file(x: str) -> bool: - return has_file_allowed_extension( - x, extensions) # type: ignore[arg-type] - - is_valid_file = cast(Callable[[str], bool], is_valid_file) - - instances = [] - available_classes = set() - for target_class in sorted(class_to_idx.keys()): - class_index = class_to_idx[target_class] - target_dir = os.path.join(directory, target_class) - if not os.path.isdir(target_dir): - continue - for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)): - for fname in sorted(fnames): - path = os.path.join(root, fname) - if is_valid_file(path): - item = path, class_index - instances.append(item) - - if target_class not in available_classes: - available_classes.add(target_class) - - empty_classes = set(class_to_idx.keys()) - available_classes - if empty_classes: - msg = ( - f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. 
" - ) - if extensions is not None: - msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}" - raise FileNotFoundError(msg) - - return instances - - -class DatasetFolder(Dataset): - """A generic data loader. - - This default directory structure can be customized by overriding the - :meth:`find_classes` method. - - Args: - root (string): Root directory path. - loader (callable): A function to load a sample given its path. - extensions (tuple[string]): A list of allowed extensions. - both extensions and is_valid_file should not be passed. - transform (callable, optional): A function/transform that takes in - a sample and returns a transformed version. - E.g, ``transforms.RandomCrop`` for images. - target_transform (callable, optional): A function/transform that takes - in the target and transforms it. - is_valid_file (callable, optional): A function that takes path of a file - and check if the file is a valid file (used to check of corrupt files) - both extensions and is_valid_file should not be passed. - - Attributes: - classes (list): List of the class names sorted alphabetically. - class_to_idx (dict): Dict with items (class_name, class_index). - samples (list): List of (sample path, class_index) tuples - targets (list): The class_index value for each image in the dataset - """ - - def __init__( - self, - root: str, - loader: Callable[[str], Any], - extensions: Optional[Tuple[str, ...]]=None, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> None: - # super().__init__(root, transform=transform, target_transform=target_transform) - # super().__init__() - self.root = root - self.transform = transform - self.target_transform = target_transform - - classes, class_to_idx = self.find_classes(self.root) - samples = self.make_dataset(self.root, class_to_idx, extensions, - is_valid_file) - - self.loader = loader - self.extensions = extensions - - self.classes = classes - self.class_to_idx = class_to_idx - self.samples = samples - self.targets = [s[1] for s in samples] - - @staticmethod - def make_dataset( - directory: str, - class_to_idx: Dict[str, int], - extensions: Optional[Tuple[str, ...]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[ - Tuple[str, int]]: - """Generates a list of samples of a form (path_to_sample, class). - - This can be overridden to e.g. read files from a compressed zip file instead of from the disk. - - Args: - directory (str): root dataset directory, corresponding to ``self.root``. - class_to_idx (Dict[str, int]): Dictionary mapping class name to class index. - extensions (optional): A list of allowed extensions. - Either extensions or is_valid_file should be passed. Defaults to None. - is_valid_file (optional): A function that takes path of a file - and checks if the file is a valid file - (used to check of corrupt files) both extensions and - is_valid_file should not be passed. Defaults to None. - - Raises: - ValueError: In case ``class_to_idx`` is empty. - ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. - FileNotFoundError: In case no valid file was found for any class. 
- - Returns: - List[Tuple[str, int]]: samples of a form (path_to_sample, class) - """ - if class_to_idx is None: - # prevent potential bug since make_dataset() would use the class_to_idx logic of the - # find_classes() function, instead of using that of the find_classes() method, which - # is potentially overridden and thus could have a different logic. - raise ValueError("The class_to_idx parameter cannot be None.") - return make_dataset( - directory, - class_to_idx, - extensions=extensions, - is_valid_file=is_valid_file) - - def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]: - """Find the class folders in a dataset structured as follows:: - - directory/ - ├── class_x - │ ├── xxx.ext - │ ├── xxy.ext - │ └── ... - │ └── xxz.ext - └── class_y - ├── 123.ext - ├── nsdf3.ext - └── ... - └── asd932_.ext - - This method can be overridden to only consider - a subset of classes, or to adapt to a different dataset directory structure. - - Args: - directory(str): Root directory path, corresponding to ``self.root`` - - Raises: - FileNotFoundError: If ``dir`` has no class folders. - - Returns: - (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index. - """ - return find_classes(directory) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - """ - Args: - index (int): Index - - Returns: - tuple: (sample, target) where target is class_index of the target class. - """ - path, target = self.samples[index] - sample = self.loader(path) - if self.transform is not None: - sample = self.transform(sample) - if self.target_transform is not None: - target = self.target_transform(target) - - return sample, target - - def __len__(self) -> int: - return len(self.samples) - - -IMG_EXTENSIONS = ( - ".jpg", - ".jpeg", - ".png", - ".ppm", - ".bmp", - ".pgm", - ".tif", - ".tiff", - ".webp", ) - - -def pil_loader(path: str) -> Image.Image: - # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) - with open(path, "rb") as f: - img = Image.open(f) - return img.convert("RGB") - - -def default_loader(path: str) -> Any: - return pil_loader(path) - - -class ImageFolder(DatasetFolder): - """A generic data loader where the images are arranged in this way by default: :: - - root/dog/xxx.png - root/dog/xxy.png - root/dog/[...]/xxz.png - - root/cat/123.png - root/cat/nsdf3.png - root/cat/[...]/asd932_.png - - This class inherits from :class:`~torchvision.datasets.DatasetFolder` so - the same methods can be overridden to customize the dataset. - - Args: - root (string): Root directory path. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.RandomCrop`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - loader (callable, optional): A function to load an image given its path. - is_valid_file (callable, optional): A function that takes path of an Image file - and check if the file is a valid file (used to check of corrupt files) - - Attributes: - classes (list): List of the class names sorted alphabetically. - class_to_idx (dict): Dict with items (class_name, class_index). 
- imgs (list): List of (image path, class_index) tuples - """ - - def __init__( - self, - root: str, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - loader: Callable[[str], Any]=default_loader, - is_valid_file: Optional[Callable[[str], bool]]=None, ): - super().__init__( - root, - loader, - IMG_EXTENSIONS if is_valid_file is None else None, - transform=transform, - target_transform=target_transform, - is_valid_file=is_valid_file, ) - self.imgs = self.samples diff --git a/paddlevlp/models/blip2/eva_vit.py b/paddlevlp/models/blip2/eva_vit.py deleted file mode 100644 index 5e6d2c50c950f..0000000000000 --- a/paddlevlp/models/blip2/eva_vit.py +++ /dev/null @@ -1,517 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Callable - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.nn.initializer import Constant, Normal, TruncatedNormal - -from paddlemix.models.blip2.configuration import Blip2VisionConfig -from paddlemix.models.blip2.modeling import Blip2PretrainedModel -# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# reference: https://arxiv.org/abs/2010.11929 -from paddlemix.utils.log import logger - -trunc_normal_ = TruncatedNormal(std=0.02) -normal_ = Normal -zeros_ = Constant(value=0.0) -ones_ = Constant(value=1.0) -from paddle.distributed.fleet.utils import recompute - - -def to_2tuple(x): - return tuple([x] * 2) - - -def drop_path(x, drop_prob=0.0, training=False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
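A hedged sketch of how the stochastic-depth `drop_path` function above behaves at training time; it assumes paddle is available and the function is in scope.

import paddle

x = paddle.ones([4, 3])                        # 4 samples from a residual branch
y = drop_path(x, drop_prob=0.5, training=True)
# Each sample is either zeroed out entirely or rescaled by 1 / (1 - drop_prob),
# so the expected value per sample still equals x.
print(y)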
- """ - if drop_prob == 0.0 or not training: - return x - keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Mlp(nn.Layer): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - mp_degree=1, ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - if mp_degree > 1: - self.fc1 = fleet.meta_parallel.ColumnParallelLinear( - in_features, - hidden_features, - weight_attr=None, - has_bias=True, - gather_output=True, ) - self.fc2 = fleet.meta_parallel.ColumnParallelLinear( - hidden_features, - out_features, - weight_attr=None, - has_bias=True, - gather_output=True, ) - else: - self.fc1 = nn.Linear(in_features, hidden_features) - self.fc2 = nn.Linear(hidden_features, out_features) - self.mp_degree = mp_degree - self.act = act_layer() - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.fc2(x) - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - x = self.drop(x) - else: - x = self.drop(x) - return x - - -class Attention(nn.Layer): - def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - window_size=None, - mp_degree=1, ): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - if mp_degree > 1: - self.qkv = fleet.meta_parallel.ColumnParallelLinear( - dim, - dim * 3, - weight_attr=None, - has_bias=True, - gather_output=True) - else: - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - if mp_degree > 1: - self.proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) - else: - self.proj = nn.Linear(dim, dim) - self.mp_degree = mp_degree - self.proj_drop = nn.Dropout(proj_drop) - - def _register_relative_position_index( - self, - window_size, - num_heads, ): - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = paddle.zeros( - (window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype) - 
relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - def forward(self, x, rel_pos_bias=None): - # B= paddle.shape(x)[0] - N, C = x.shape[1:] - # if self.q_bias is not None: - # qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) - qkv = (self.qkv(x).reshape( - (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( - (2, 0, 3, 1, 4))) - # print(self.qkv.bias[2100]) - q, k, v = qkv[0], qkv[1], qkv[2] - - attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - if hasattr(self, "relative_position_bias_table"): - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, - -1, - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - attn = nn.functional.softmax(attn, axis=-1) - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - attn = self.attn_drop(attn) - else: - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) - x = self.proj(x) - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - x = self.proj_drop(x) - else: - x = self.proj_drop(x) - return x - - -class Block(nn.Layer): - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - init_values=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer="nn.LayerNorm", - epsilon=1e-5, - window_size=None, - mp_degree=1, ): - super().__init__() - if isinstance(norm_layer, str): - self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm1 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size, - mp_degree=mp_degree, ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) - self.gamma_1 = None - self.gamma_2 = None - if isinstance(norm_layer, str): - self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm2 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - mp_degree=mp_degree, ) - - def forward(self, x, rel_pos_bias=None): - if self.gamma_1 is not None: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - else: - x = x + self.drop_path( - self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class RelativePositionBias(nn.Layer): - def __init__(self, window_size, num_heads): - super().__init__() - 
self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = paddle.zeros( - (window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - # trunc_normal_(self.relative_position_bias_table, std=.02) - - def forward(self): - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, - -1, - ]) # Wh*Ww,Wh*Ww,nH - return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww - - -class PatchEmbed(nn.Layer): - """Image to Patch Embedding""" - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // - patch_size[0]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - B, C, H, W = x.shape - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
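A hedged sketch of the patch/token arithmetic for the `PatchEmbed` layer above, assuming a 224x224 input and 16x16 patches as in the defaults.

import paddle

pe = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
print(pe.num_patches)           # (224 // 16) * (224 // 16) == 196

x = paddle.randn([2, 3, 224, 224])
print(pe.proj(x).shape)         # [2, 768, 14, 14] before flattening
print(pe(x).shape)              # [2, 196, 768] after the flatten/transpose below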
- - x = self.proj(x).flatten(2).transpose((0, 2, 1)) - return x - - -class VisionTransformer(Blip2PretrainedModel): - """Vision Transformer with support for patch input""" - - main_input_name = "pixel_values" - config_class = Blip2VisionConfig - - def __init__(self, config: Blip2VisionConfig, **kwargs): - super().__init__(config) - from paddle.distributed import fleet - - mp_degree = fleet.DistributedStrategy().hybrid_configs["mp_degree"] - self.class_num = config.class_num - self.num_features = self.embed_dim = config.embed_dim - _img_size = to_2tuple(config.img_size) - _patch_size = to_2tuple(config.patch_size) - self.window_size = ( - _img_size[0] // _patch_size[0], - _img_size[1] // _patch_size[1], ) - self.patch_embed = PatchEmbed( - img_size=config.img_size, - patch_size=config.patch_size, - in_chans=config.in_chans, - embed_dim=config.embed_dim, ) - num_patches = self.patch_embed.num_patches - self.cls_token = self.create_parameter( - shape=(1, 1, config.embed_dim), default_initializer=zeros_) - - self.pos_embed = self.create_parameter( - shape=(1, num_patches + 1, config.embed_dim), - default_initializer=zeros_) - - self.add_parameter("pos_embed", self.pos_embed) - - self.add_parameter("cls_token", self.cls_token) - self.pos_drop = nn.Dropout(p=config.drop_rate) - self.gradient_checkpointing = config.gradient_checkpointing - logger.info("self.gradient_checkpointing:{}".format( - self.gradient_checkpointing)) - dpr = np.linspace(0, config.drop_path_rate, config.depth) - - self.blocks = nn.LayerList([ - Block( - dim=config.embed_dim, - num_heads=config.num_heads, - mlp_ratio=config.mlp_ratio, - qkv_bias=config.qkv_bias, - qk_scale=config.qk_scale, - drop=config.drop_rate, - attn_drop=config.attn_drop_rate, - drop_path=dpr[i], - norm_layer=config.norm_layer, - epsilon=config.epsilon, - window_size=self.window_size, - mp_degree=mp_degree, ) for i in range(config.depth) - ]) - - self.mp_degree = mp_degree - if self.pos_embed is not None: - trunc_normal_(self.pos_embed) - trunc_normal_(self.cls_token) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, (nn.Linear, fleet.meta_parallel.ColumnParallelLinear)): - trunc_normal_(m.weight) - if (isinstance(m, (nn.Linear, - fleet.meta_parallel.ColumnParallelLinear)) and - m.bias is not None): - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - ones_(m.weight) - - def forward_features(self, x): - # B = x.shape[0] - B = paddle.shape(x)[0] - x = self.patch_embed(x) - cls_tokens = self.cls_token.expand((B, -1, -1)) - x = paddle.concat((cls_tokens, x), axis=1) - - if self.pos_embed is not None: - x = x + self.pos_embed - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - x = self.pos_drop(x) - else: - x = self.pos_drop(x) - rel_pos_bias = self.rel_pos_bias() if hasattr(self, - "rel_pos_bias") else None - for blk in self.blocks: - if self.gradient_checkpointing and self.training: - - x = recompute(blk, x, rel_pos_bias=rel_pos_bias) - else: - x = blk(x, rel_pos_bias=rel_pos_bias) - # x = self.norm(x) - return x - - def forward(self, x): - x = self.forward_features(x) - return x - - -def interpolate_pos_embed(model, checkpoint_model): - if "visual_encoder.pos_embed" in checkpoint_model: - pos_embed_checkpoint = checkpoint_model["visual_encoder.pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.visual_encoder.patch_embed.num_patches - num_extra_tokens = model.visual_encoder.pos_embed.shape[ - -2] - num_patches - # height (== width) for the 
checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) - pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode="bicubic", - align_corners=False, ) - pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) - new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model["visual_encoder.pos_embed"] = new_pos_embed - elif "pos_embed" in checkpoint_model: - pos_embed_checkpoint = checkpoint_model["pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.patch_embed.num_patches - num_extra_tokens = model.pos_embed.shape[-2] - num_patches - # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) - pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode="bicubic", - align_corners=False, ) - pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) - new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model["pos_embed"] = new_pos_embed diff --git a/paddlevlp/trainer/trainer.py b/paddlevlp/trainer/trainer.py deleted file mode 100644 index 8b2e44bd86de8..0000000000000 --- a/paddlevlp/trainer/trainer.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
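A hedged sketch of the reshaping performed by the `interpolate_pos_embed` helper above when a checkpoint was trained at a different resolution; the grid sizes and tensors are illustrative, and only paddle is assumed.

import paddle

embed_dim, num_extra_tokens = 768, 1
old_grid, new_grid = 14, 16                      # e.g. 224/16 patches -> 256/16 patches
pos_embed = paddle.randn([1, num_extra_tokens + old_grid**2, embed_dim])

extra = pos_embed[:, :num_extra_tokens]          # class-token embedding is kept as-is
patch = pos_embed[:, num_extra_tokens:]
patch = patch.reshape((-1, old_grid, old_grid, embed_dim)).transpose((0, 3, 1, 2))
patch = paddle.nn.functional.interpolate(
    patch, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
patch = patch.transpose((0, 2, 3, 1)).flatten(1, 2)
new_pos_embed = paddle.concat((extra, patch), axis=1)
print(new_pos_embed.shape)                       # [1, 1 + 256, 768]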
-import numpy as np -import paddle -from paddle.io import DataLoader -from paddlenlp.trainer.trainer import Trainer -from tensorboardX import SummaryWriter - -from paddlemix.models.evaclip.utils import clip_grad_norm - - -class CLIPTrainer(Trainer): - def __init__(self, **kwargs): - """ - Implementation of an `Trainer` suitable for EVA-CLIP - 1、selfdefine optimizer for sharding which can't create by passing by args - - Args: - kwargs (dict): any arugments to pass to `Trainer` - - Returns: - None - """ - super().__init__(**kwargs) - self.rank = paddle.distributed.get_rank() - if self.rank == 0 and self.args.tensorboard: - self.writer = SummaryWriter("output/tensorboard") - self.logstep = 0 - - def training_step(self, model, inputs) -> paddle.Tensor: - """ - Perform a training step on a batch of inputs. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Layer`): - The model to train. - inputs (`Dict[str, Union[paddle.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - - Return: - `paddle.Tensor`: The tensor with training loss on this batch. - """ - - if self.args.pipeline_parallel_degree > 1: - return self.training_pipeline_step(model, inputs) - - model.train() - inputs = self._prepare_inputs(inputs) - - with self.autocast_smart_context_manager(): - loss, outputs = self.compute_loss(model, inputs, return_outputs=1) - loss_itc, image_features, text_features, logit_scale = outputs - - if self.args.gradient_accumulation_steps > 1: - loss = loss / self.args.gradient_accumulation_steps - - if self.do_grad_scaling: - self.scaler.scale(loss).backward() - else: - loss.backward() - - if self.args.max_grad_norm > 0.0: - grad_norms = clip_grad_norm(model, self.args.max_grad_norm) - if self.rank == 0 and self.args.tensorboard: - self.logstep += 1 - self.writer.add_scalar("train/loss", loss.item(), self.logstep) - self.writer.add_scalar("train/grad_norm", - grad_norms.item(), self.logstep) - self.writer.add_scalar("train/logit_scale", - logit_scale.item(), self.logstep) - - return loss.detach() - - def get_train_dataloader(self): - """ - Returns the training [`~paddle.io.DataLoader`]. - - Will use no sampler if `self.train_dataset` does not implement `__len__`, a random sampler (adapted to - distributed training if necessary) otherwise. - - Subclass and override this method if you want to inject some custom behavior. 
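A hedged, standalone sketch of the gradient-accumulation scaling used in `training_step` above, written against plain paddle rather than the Trainer machinery; the tiny model and data are placeholders.

import paddle

model = paddle.nn.Linear(4, 1)
opt = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=1e-3)
accum_steps = 4

for step in range(8):
    x = paddle.randn([2, 4])
    loss = model(x).mean()
    # Dividing keeps the accumulated gradient equal to the average over the
    # micro-batches, matching the loss scaling in training_step.
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        opt.step()
        opt.clear_grad()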
- """ - - return DataLoader( - self.train_dataset, - batch_size=self.args.per_device_train_batch_size, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - prefetch_factor=1, - shuffle=False, ) diff --git a/ppdiffusers/deploy/controlnet/export_model.py b/ppdiffusers/deploy/controlnet/export_model.py index b6b3b146277a8..c2a406db5acc9 100644 --- a/ppdiffusers/deploy/controlnet/export_model.py +++ b/ppdiffusers/deploy/controlnet/export_model.py @@ -18,36 +18,42 @@ import paddle -from ppdiffusers import (ControlNetModel, FastDeployRuntimeModel, - FastDeployStableDiffusionControlNetPipeline, - StableDiffusionControlNetPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + ControlNetModel, + FastDeployRuntimeModel, + FastDeployStableDiffusionControlNetPipeline, + StableDiffusionControlNetPipeline, + UNet2DConditionModel, +) class ControlNetWithUnetModel(paddle.nn.Layer): def __init__( - self, - unet, - controlnet, ): + self, + unet, + controlnet, + ): super().__init__() self.unet = unet self.controlnet = controlnet def forward( - self, - sample, - timestep, - encoder_hidden_states, - controlnet_cond, - controlnet_conditioning_scale, - return_dict=True, ): + self, + sample, + timestep, + encoder_hidden_states, + controlnet_cond, + controlnet_conditioning_scale, + return_dict=True, + ): down_block_res_samples, mid_block_res_sample = self.controlnet( sample, timestep, encoder_hidden_states=encoder_hidden_states, controlnet_cond=controlnet_cond, conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) + return_dict=False, + ) noise_pred = self.unet( sample, @@ -55,21 +61,21 @@ def forward( encoder_hidden_states=encoder_hidden_states, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, - return_dict=return_dict, ) + return_dict=return_dict, + ) return noise_pred def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - controlnet_model_path: str, - output_path: str, - sample: bool=False, - height: int=None, - width: int=None, ): - unet_tmp = UNet2DConditionModel.from_pretrained( - model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - controlnet_tmp = ControlNetModel.from_pretrained( - controlnet_model_path, resnet_pre_temb_non_linearity=True) + model_path: str, + controlnet_model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): + unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") + controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=True) pipeline = StableDiffusionControlNetPipeline.from_pretrained( model_path, @@ -77,7 +83,8 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( controlnet=controlnet_tmp, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) # make sure we disable xformers pipeline.disable_xformers_memory_efficient_attention() output_path = Path(output_path) @@ -85,8 +92,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( latent_height = height // 8 if height is not None else None latent_width = width // 8 if width is not None else None # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 vae_in_channels = 
pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -94,14 +100,12 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert text_encoder text_encoder = paddle.jit.to_static( pipeline.text_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None], dtype="int64", name="input_ids") - ], # input_ids + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids ) save_path = os.path.join(args.output_path, "text_encoder", "inference") paddle.jit.save(text_encoder, save_path) @@ -109,8 +113,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( del pipeline.text_encoder # wrap unet + controlnet - new_unet = ControlNetWithUnetModel( - unet=pipeline.unet, controlnet=pipeline.controlnet) + new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet) # 2. Convert unet unet = paddle.jit.to_static( @@ -119,23 +122,26 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( paddle.static.InputSpec( shape=[None, unet_channels, latent_height, latent_width], dtype="float32", - name="sample", ), # sample - paddle.static.InputSpec( - shape=[1], dtype="float32", name="timestep"), # timestep + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states + name="encoder_hidden_states", + ), # encoder_hidden_states paddle.static.InputSpec( shape=[None, vae_in_channels, height, width], dtype="float32", - name="controlnet_cond", ), # controlnet_cond + name="controlnet_cond", + ), # controlnet_cond paddle.static.InputSpec( shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1], dtype="float32", name="controlnet_conditioning_scale", ), # controlnet_conditioning_scale - ], ) + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) @@ -152,8 +158,7 @@ def forward_vae_encoder_sample(self, z): # 3. Convert vae encoder vae_encoder = pipeline.vae if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) else: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) @@ -165,7 +170,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -184,8 +190,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. 
save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -193,18 +201,16 @@ def forward_vae_decoder(self, z): del pipeline.vae fastdeploy_pipeline = FastDeployStableDiffusionControlNetPipeline( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "text_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), tokenizer=pipeline.tokenizer, scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -224,26 +230,25 @@ def forward_vae_decoder(self, z): default="lllyasviel/sd-controlnet-canny", help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--sample", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) parser.add_argument( "--height", type=int, default=None, - help="The height of output images. Default: None", ) + help="The height of output images. Default: None", + ) parser.add_argument( "--width", type=int, default=None, - help="The width of output images. Default: None", ) + help="The width of output images. 
Default: None", + ) args = parser.parse_args() convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( @@ -252,4 +257,5 @@ def forward_vae_decoder(self, z): args.output_path, args.sample, args.height, - args.width, ) + args.width, + ) diff --git a/ppdiffusers/deploy/controlnet/infer.py b/ppdiffusers/deploy/controlnet/infer.py index 3e516abb02cc0..10350965eb703 100644 --- a/ppdiffusers/deploy/controlnet/infer.py +++ b/ppdiffusers/deploy/controlnet/infer.py @@ -27,8 +27,7 @@ from PIL import Image from tqdm.auto import trange -from ppdiffusers import (DiffusionPipeline, - FastDeployStableDiffusionMegaPipeline) +from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline from ppdiffusers.utils import load_image @@ -48,17 +47,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5@fastdeploy", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--backend", type=str, @@ -78,7 +80,8 @@ def parse_arguments(): "huawei_ascend_npu", "kunlunxin_xpu", ], - help="The inference runtime device of models.", ) + help="The inference runtime device of models.", + ) parser.add_argument( "--task_name", type=str, @@ -100,17 +103,10 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -131,7 +127,8 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) + help="The scheduler type of stable diffusion.", + ) parser.add_argument( "--infer_op", type=str, @@ -141,33 +138,25 @@ def parse_arguments(): "raw", "all", ], - help="The type of infer op.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") - parser.add_argument( - "--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + help="The type of infer op.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") parser.add_argument( "--low_threshold", type=int, default=100, - help="The value of Canny low threshold.", ) + help="The value of Canny low threshold.", + ) parser.add_argument( "--high_threshold", type=int, default=200, - help="The value of Canny high threshold.", ) + help="The value of Canny high threshold.", + ) return parser.parse_args() @@ -182,14 +171,15 @@ def create_ort_runtime(device_id=0): def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, ): + use_trt=False, + dynamic_shape=None, + use_fp16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, + workspace=None, +): option = fd.RuntimeOption() option.use_paddle_backend() if device_id == -1: @@ -227,7 +217,8 @@ def create_paddle_inference_runtime( key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), - shape_dict.get("max_shape", None), ) + shape_dict.get("max_shape", None), + ) return option @@ -238,8 +229,10 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): option.use_ascend() option.set_lite_device_names(["huawei_ascend_npu"]) option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision". 
- format(device_id)) + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) elif device == "kunlunxin_xpu": # TODO(shentanyue): Add kunlunxin_xpu code # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 @@ -251,7 +244,8 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): autotune_file="", precision="int16", adaptive_seqlen=True, - enable_multi_stream=True, ) + enable_multi_stream=True, + ) if use_fp16: option.enable_lite_fp16() else: @@ -259,10 +253,7 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): return option -def create_trt_runtime(workspace=(1 << 31), - dynamic_shape=None, - use_fp16=False, - device_id=0): +def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): option = fd.RuntimeOption() option.use_trt_backend() option.use_gpu(device_id) @@ -276,7 +267,8 @@ def create_trt_runtime(workspace=(1 << 31), key, min_shape=shape_dict["min_shape"], opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), ) + max_shape=shape_dict.get("max_shape", None), + ) # cache_file = os.path.join(model_dir, model_prefix, "inference.trt") # option.set_trt_cache_file(cache_file) return option @@ -288,8 +280,7 @@ def main(args): paddle_stream = None else: paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream( - args.device_id).cuda_stream + paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream infer_op_dict = { "vae_encoder": args.infer_op, @@ -323,12 +314,9 @@ def main(args): } vae_decoder_dynamic_shape = { "latent_sample": { - "min_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": - [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } } unet_dynamic_shape = { @@ -379,37 +367,38 @@ def main(args): text_encoder=create_ort_runtime(device_id=args.device_id), vae_encoder=create_ort_runtime(device_id=args.device_id), vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), ) + unet=create_ort_runtime(device_id=args.device_id), + ) elif args.backend == "paddlelite": runtime_options = dict( - text_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime( - device=args.device, - device_id=args.device_id, - use_fp16=args.use_fp16), ) + text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), + ) elif 
args.backend == "tensorrt": runtime_options = dict( text_encoder=create_trt_runtime( dynamic_shape=text_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_encoder=create_trt_runtime( dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_decoder=create_trt_runtime( dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), unet=create_trt_runtime( dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), ) + device_id=args.device_id, + ), + ) elif args.backend == "paddle" or args.backend == "paddle_tensorrt": args.use_trt = args.backend == "paddle_tensorrt" runtime_options = dict( @@ -419,28 +408,34 @@ def main(args): use_fp16=args.use_fp16, device_id=args.device_id, disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_encoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_decoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), unet=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id, - paddle_stream=paddle_stream, ), ) + paddle_stream=paddle_stream, + ), + ) pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( args.model_dir, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -454,9 +449,7 @@ def main(args): else: infer_op_list = [args.infer_op] if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print( - "When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op." 
- ) + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") infer_op_list = ["raw"] for infer_op in infer_op_list: @@ -466,8 +459,7 @@ def main(args): "text_encoder": infer_op, "unet": infer_op, } - folder = (f"infer_op_{infer_op}_fp16" - if args.use_fp16 else f"infer_op_{infer_op}_fp32") + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img_control", "all"]: @@ -487,7 +479,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test text2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -500,7 +493,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -526,7 +520,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test img2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -540,7 +535,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -551,7 +547,9 @@ def main(args): images[0].save(f"{folder}/img2img_control.png") if args.task_name in ["inpaint_legacy_control", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -569,7 +567,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test inpaint_legacy_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -584,7 +583,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -636,7 +636,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test hiresfix_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -653,7 +654,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images 
latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/controlnet/infer_dygraph.py b/ppdiffusers/deploy/controlnet/infer_dygraph.py index 94204a1a5bc77..89bd4d1e51aa9 100644 --- a/ppdiffusers/deploy/controlnet/infer_dygraph.py +++ b/ppdiffusers/deploy/controlnet/infer_dygraph.py @@ -60,12 +60,14 @@ def parse_arguments(): "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -87,12 +89,9 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode") parser.add_argument( "--guess_mode", type=strtobool, @@ -104,12 +103,9 @@ def parse_arguments(): type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -129,31 +125,24 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") parser.add_argument( "--low_threshold", type=int, default=100, - help="The value of Canny low threshold.", ) + help="The value of Canny low threshold.", + ) parser.add_argument( "--high_threshold", type=int, default=200, - help="The value of Canny high threshold.", ) + help="The value of Canny high threshold.", + ) return parser.parse_args() @@ -165,8 +154,8 @@ def main(args): seed = 1024 paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 controlnet = ControlNetModel.from_pretrained( - args.controlnet_pretrained_model_name_or_path, - paddle_dtype=paddle_dtype) + args.controlnet_pretrained_model_name_or_path, paddle_dtype=paddle_dtype + ) pipe = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, controlnet=controlnet, @@ -174,7 +163,8 @@ def main(args): feature_extractor=None, requires_safety_checker=False, paddle_dtype=paddle_dtype, - custom_pipeline="stable_diffusion_mega", ) + 
custom_pipeline="stable_diffusion_mega", + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -200,17 +190,14 @@ def main(args): raise ValueError(e) if not args.use_fp16 and attention_type == "flash": - print( - "Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!" - ) + print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!") continue guess_mode = args.guess_mode width = args.width height = args.height hr_resize_width = args.hr_resize_width hr_resize_height = args.hr_resize_height - folder = (f"attn_{attention_type}_fp16" - if args.use_fp16 else f"attn_{attention_type}_fp32") + folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img_control", "all"]: init_image = load_image( @@ -229,7 +216,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test text2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -242,7 +230,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -268,7 +257,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test img2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -282,7 +272,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -293,7 +284,9 @@ def main(args): images[0].save(f"{folder}/img2img_control.png") if args.task_name in ["inpaint_legacy_control", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -311,7 +304,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -326,7 +320,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -358,7 +353,8 @@ def main(args): controlnet_cond=controlnet_cond, 
controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test hiresfix_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -375,7 +371,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py b/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py index 673834d7dbd52..64e67ac852da1 100644 --- a/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py +++ b/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py @@ -18,23 +18,31 @@ import torch -torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention delattr(torch.nn.functional, "scaled_dot_product_attention") import cv2 import numpy as np from diffusers import ( - ControlNetModel, DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, + ControlNetModel, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPipeline, - UniPCMultistepScheduler) -from diffusers.models.attention_processor import (AttnProcessor, - AttnProcessor2_0) + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 from diffusers.utils import load_image from PIL import Image from tqdm.auto import trange @@ -67,46 +75,40 @@ def change_scheduler(self, scheduler_type="ddim"): self.orginal_scheduler_config = self.scheduler.config scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = 
EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") return scheduler @@ -131,12 +133,14 @@ def parse_arguments(): "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=10, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -157,20 +161,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) parser.add_argument( "--channels_last", type=strtobool, default=False, - help="Wheter to use channels_last", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="Whether to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode") parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument( - "--compile", type=strtobool, default=False, help="compile") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") parser.add_argument( "--attention_type", type=str, default="raw", choices=[ "raw", "sdp", ], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -204,21 +202,22 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") parser.add_argument( "--low_threshold", type=int, default=100, - help="The value of Canny low threshold.", ) + help="The value of Canny low threshold.", + ) parser.add_argument( "--high_threshold", type=int, default=200, - help="The value of Canny high threshold.", ) + help="The value of Canny high threshold.", + ) return parser.parse_args() @@ -272,14 +271,16 @@ def main(args): seed = 1024 torch_dtype = torch.float16 if args.use_fp16 else torch.float32 controlnet = ControlNetModel.from_pretrained( - args.controlnet_pretrained_model_name_or_path, torch_dtype=torch_dtype) + args.controlnet_pretrained_model_name_or_path, torch_dtype=torch_dtype + ) pipe = StableDiffusionControlNetPipeline.from_pretrained( args.pretrained_model_name_or_path, controlnet=controlnet, safety_checker=None, feature_extractor=None, requires_safety_checker=False, - torch_dtype=torch_dtype, ) + torch_dtype=torch_dtype, + ) scheduler = change_scheduler(pipe, args.scheduler) pipe.scheduler = scheduler if args.device_id >= 0: @@ -291,11 +292,9 @@ def main(args): args.attention_type = [args.attention_type] for attention_type in args.attention_type: - attn_prrocessor_cls = (AttnProcessor - if attention_type == "raw" else AttnProcessor2_0) + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = ( - torch.nn.functional.scaled_dot_product_attention_) + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ set_attn_processor(pipe.unet, attn_prrocessor_cls()) set_attn_processor(pipe.vae, attn_prrocessor_cls()) set_attn_processor(pipe.controlnet, attn_prrocessor_cls()) @@ -306,24 +305,20 @@ def main(args): if args.compile: print("Run torch compile") - pipe.unet = torch.compile( - pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.controlnet = torch.compile( - pipe.controlnet, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) width = args.width height = args.height pipe.set_progress_bar_config(disable=True) - folder = (f"torch_attn_{attention_type}_fp16" - if args.use_fp16 else f"torch_attn_{attention_type}_fp32") + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img_control", "all"]: init_image = load_image( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" ) - controlnet_cond = get_canny_image(init_image, args).resize( - (width, height)) + controlnet_cond = get_canny_image(init_image, args).resize((width, height)) # text2img prompt = "bird" time_costs = [] @@ -334,7 +329,8 @@ def main(args): height=height, width=width, image=controlnet_cond, - 
controlnet_conditioning_scale=1.0, ) + controlnet_conditioning_scale=1.0, + ) print("==> Test text2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -345,7 +341,8 @@ def main(args): height=height, width=width, image=controlnet_cond, - controlnet_conditioning_scale=1.0, ).images + controlnet_conditioning_scale=1.0, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -356,13 +353,11 @@ def main(args): images[0].save(f"{folder}/text2img_control.png") if args.task_name in ["img2img_control", "all"]: - pipe_img2img = StableDiffusionControlNetImg2ImgPipeline( - **pipe.components) + pipe_img2img = StableDiffusionControlNetImg2ImgPipeline(**pipe.components) pipe_img2img.set_progress_bar_config(disable=True) img_url = "sketch-mountains-input.png" init_image = load_image(img_url).resize((width, height)) - controlnet_cond = get_canny_image(init_image, args).resize( - (width, height)) + controlnet_cond = get_canny_image(init_image, args).resize((width, height)) prompt = "A fantasy landscape, trending on artstation" time_costs = [] # warmup @@ -373,7 +368,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ) + controlnet_conditioning_scale=1.0, + ) print("==> Test img2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -385,7 +381,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ).images + controlnet_conditioning_scale=1.0, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -396,15 +393,15 @@ def main(args): images[0].save(f"{folder}/img2img_control.png") if args.task_name in ["inpaint_legacy_control", "all"]: - pipe_inpaint = StableDiffusionControlNetInpaintPipeline( - **pipe.components) + pipe_inpaint = StableDiffusionControlNetInpaintPipeline(**pipe.components) pipe_inpaint.set_progress_bar_config(disable=True) - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url).resize((width, height)) mask_image = load_image(mask_url).resize((width, height)) - controlnet_cond = get_canny_image(init_image, args).resize( - (width, height)) + controlnet_cond = get_canny_image(init_image, args).resize((width, height)) prompt = "Face of a yellow cat, high resolution, sitting on a park bench" time_costs = [] task_name = "inpaint_legacy_control" @@ -416,7 +413,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ) + controlnet_conditioning_scale=1.0, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -429,7 +427,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ).images + controlnet_conditioning_scale=1.0, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/export_model.py b/ppdiffusers/deploy/export_model.py index b7defe65362ce..00b9b4fd03f4a 100644 --- 
a/ppdiffusers/deploy/export_model.py +++ b/ppdiffusers/deploy/export_model.py @@ -19,26 +19,27 @@ import paddle -from ppdiffusers import (FastDeployRuntimeModel, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionMegaPipeline, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + FastDeployRuntimeModel, + FastDeployStableDiffusionInpaintPipeline, + FastDeployStableDiffusionMegaPipeline, + StableDiffusionPipeline, + UNet2DConditionModel, +) def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - output_path: str, - sample: bool=False, - height: int=None, - width: int=None, ): + model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): # specify unet model with unet pre_temb_act opt enabled. - unet_model = UNet2DConditionModel.from_pretrained( - model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") + unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") pipeline = StableDiffusionPipeline.from_pretrained( - model_path, - unet=unet_model, - safety_checker=None, - feature_extractor=None) + model_path, unet=unet_model, safety_checker=None, feature_extractor=None + ) # make sure we disable xformers pipeline.disable_xformers_memory_efficient_attention() output_path = Path(output_path) @@ -46,8 +47,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( latent_height = height // 8 if height is not None else None latent_width = width // 8 if width is not None else None # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 or 9 vae_in_channels = pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -55,14 +55,12 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert text_encoder text_encoder = paddle.jit.to_static( pipeline.text_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None], dtype="int64", name="input_ids") - ], # input_ids + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids ) save_path = os.path.join(args.output_path, "text_encoder", "inference") paddle.jit.save(text_encoder, save_path) @@ -76,14 +74,16 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( paddle.static.InputSpec( shape=[None, unet_channels, latent_height, latent_width], dtype="float32", - name="sample", ), # sample - paddle.static.InputSpec( - shape=[1], dtype="float32", name="timestep"), # timestep + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states - ], ) + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) print(f"Save unet model in {save_path} successfully.") @@ -98,8 +98,7 @@ def forward_vae_encoder_sample(self, z): # 3. 
Convert vae encoder vae_encoder = pipeline.vae if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) else: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) @@ -111,7 +110,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -130,8 +130,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -144,18 +146,16 @@ def forward_vae_decoder(self, z): fd_pipe_cls = FastDeployStableDiffusionMegaPipeline fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "text_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), tokenizer=pipeline.tokenizer, scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -169,26 +169,25 @@ def forward_vae_decoder(self, z): required=True, help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--sample", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) parser.add_argument( "--height", type=int, default=None, - help="The height of output images. Default: None", ) + help="The height of output images. Default: None", + ) parser.add_argument( "--width", type=int, default=None, - help="The width of output images. Default: None", ) + help="The width of output images. 
Default: None", + ) args = parser.parse_args() convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( @@ -196,4 +195,5 @@ def forward_vae_decoder(self, z): args.output_path, args.sample, args.height, - args.width, ) + args.width, + ) diff --git a/ppdiffusers/deploy/infer.py b/ppdiffusers/deploy/infer.py index 8445343f255da..60152a7db32f4 100644 --- a/ppdiffusers/deploy/infer.py +++ b/ppdiffusers/deploy/infer.py @@ -25,8 +25,7 @@ from paddlenlp.trainer.argparser import strtobool from tqdm.auto import trange -from ppdiffusers import (DiffusionPipeline, - FastDeployStableDiffusionMegaPipeline) +from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline from ppdiffusers.utils import load_image @@ -36,17 +35,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5@fastdeploy", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--backend", type=str, @@ -66,7 +68,8 @@ def parse_arguments(): "huawei_ascend_npu", "kunlunxin_xpu", ], - help="The inference runtime device of models.", ) + help="The inference runtime device of models.", + ) parser.add_argument( "--task_name", type=str, @@ -91,22 +94,11 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") - parser.add_argument( - "--use_bf16", - type=strtobool, - default=False, - help="Wheter to use BF16 mode") - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -127,7 +119,8 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) + help="The scheduler type of stable diffusion.", + ) parser.add_argument( "--infer_op", type=str, @@ -137,23 +130,13 @@ def parse_arguments(): "raw", "all", ], - help="The type of infer op.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") - parser.add_argument( - "--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + help="The type of infer op.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") return parser.parse_args() @@ -169,15 +152,16 @@ def create_ort_runtime(device_id=0): def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - use_bf16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, ): + use_trt=False, + dynamic_shape=None, + use_fp16=False, + use_bf16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, + workspace=None, +): assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive" option = fd.RuntimeOption() option.use_paddle_backend() @@ -218,7 +202,8 @@ def create_paddle_inference_runtime( key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), - shape_dict.get("max_shape", None), ) + shape_dict.get("max_shape", None), + ) return option @@ -229,8 +214,10 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): option.use_ascend() option.set_lite_device_names(["huawei_ascend_npu"]) option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision". 
- format(device_id)) + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) elif device == "kunlunxin_xpu": # TODO(shentanyue): Add kunlunxin_xpu code # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 @@ -242,7 +229,8 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): autotune_file="", precision="int16", adaptive_seqlen=True, - enable_multi_stream=True, ) + enable_multi_stream=True, + ) if use_fp16: option.enable_lite_fp16() else: @@ -250,10 +238,7 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): return option -def create_trt_runtime(workspace=(1 << 31), - dynamic_shape=None, - use_fp16=False, - device_id=0): +def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): option = fd.RuntimeOption() option.use_trt_backend() option.use_gpu(device_id) @@ -267,7 +252,8 @@ def create_trt_runtime(workspace=(1 << 31), key, min_shape=shape_dict["min_shape"], opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), ) + max_shape=shape_dict.get("max_shape", None), + ) return option @@ -277,8 +263,7 @@ def main(args): paddle_stream = None else: paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream( - args.device_id).cuda_stream + paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream seed = 1024 vae_in_channels = 4 @@ -314,12 +299,9 @@ def main(args): vae_decoder_dynamic_shape = { "latent_sample": { - "min_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": - [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } } @@ -361,37 +343,38 @@ def main(args): text_encoder=create_ort_runtime(device_id=args.device_id), vae_encoder=create_ort_runtime(device_id=args.device_id), vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), ) + unet=create_ort_runtime(device_id=args.device_id), + ) elif args.backend == "paddlelite": runtime_options = dict( - text_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime( - device=args.device, - device_id=args.device_id, - use_fp16=args.use_fp16), ) + text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), + ) elif args.backend == "tensorrt": runtime_options = dict( text_encoder=create_trt_runtime( dynamic_shape=text_encoder_dynamic_shape, use_fp16=args.use_fp16, - 
device_id=args.device_id, ), + device_id=args.device_id, + ), vae_encoder=create_trt_runtime( dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_decoder=create_trt_runtime( dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), unet=create_trt_runtime( dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), ) + device_id=args.device_id, + ), + ) elif args.backend == "paddle" or args.backend == "paddle_tensorrt": args.use_trt = args.backend == "paddle_tensorrt" runtime_options = dict( @@ -402,31 +385,37 @@ def main(args): use_bf16=args.use_bf16, device_id=args.device_id, disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_encoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_decoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), unet=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), ) + paddle_stream=paddle_stream, + ), + ) pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( args.model_dir, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -440,9 +429,7 @@ def main(args): else: infer_op_list = [args.infer_op] if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print( - "When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op." 
- ) + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") infer_op_list = ["raw"] for infer_op in infer_op_list: @@ -452,8 +439,7 @@ def main(args): "text_encoder": infer_op, "unet": infer_op, } - folder = (f"infer_op_{infer_op}_fp16" - if args.use_fp16 else f"infer_op_{infer_op}_fp32") + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img", "all"]: # text2img @@ -466,7 +452,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test text2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -477,7 +464,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -501,7 +489,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test img2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -513,7 +502,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -524,7 +514,9 @@ def main(args): images[0].save(f"{folder}/img2img.png") if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -545,7 +537,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -558,7 +551,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -600,7 +594,8 @@ def main(args): hr_resize_height=hr_resize_height, enable_hr=True, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test hiresfix performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -614,7 +609,8 @@ def main(args): hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height, enable_hr=True, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -626,7 +622,9 @@ def main(args): if args.task_name in ["cycle_diffusion"]: pipe.change_scheduler("ddim") - image_url = 
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + image_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + ) init_image = load_image(image_url) source_prompt = "An astronaut riding a horse" prompt = "An astronaut riding an elephant" @@ -644,7 +642,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images[0] + infer_op_dict=infer_op_dict, + ).images[0] print("==> Test cycle diffusion performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -661,7 +660,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -692,11 +692,13 @@ def main(args): time_costs = [] # warmup mixture_tiling_pipe( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ]], + prompt=[ + [ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + ] + ], tile_height=512, tile_width=512, tile_row_overlap=0, @@ -704,16 +706,19 @@ def main(args): guidance_scale=8, seed=7178915308, num_inference_steps=50, - infer_op_dict=None, ) + infer_op_dict=None, + ) print("==> Test mixture tiling.") for step in trange(args.benchmark_steps): start = time.time() images = mixture_tiling_pipe( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ]], + prompt=[ + [ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + ] + ], tile_height=512, tile_width=512, tile_row_overlap=0, @@ -721,7 +726,8 @@ def main(args): 
guidance_scale=8, seed=7178915308, num_inference_steps=50, - infer_op_dict=None, )["images"] + infer_op_dict=None, + )["images"] latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/infer_dygraph.py b/ppdiffusers/deploy/infer_dygraph.py index 4516d4b4a6d4e..f2b42a7b1daaa 100644 --- a/ppdiffusers/deploy/infer_dygraph.py +++ b/ppdiffusers/deploy/infer_dygraph.py @@ -35,17 +35,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -69,23 +72,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode") parser.add_argument( "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -105,21 +102,12 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") return parser.parse_args() @@ -137,7 +125,8 @@ def main(args): feature_extractor=None, requires_safety_checker=False, paddle_dtype=paddle_dtype, - custom_pipeline="stable_diffusion_mega", ) + custom_pipeline="stable_diffusion_mega", + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -162,16 +151,13 @@ def main(args): raise ValueError(e) if not args.use_fp16 and attention_type == "flash": - print( - "Flash attention is not supported dtype=float32! Please use float16 or bfloat16. 
We will skip this!") continue width = args.width height = args.height hr_resize_width = args.hr_resize_width hr_resize_height = args.hr_resize_height - folder = (f"attn_{attention_type}_fp16" - if args.use_fp16 else f"attn_{attention_type}_fp32") + folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img", "all"]: # text2img @@ -183,7 +169,8 @@ def main(args): num_inference_steps=10, height=height, width=width, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test text2img performance.") paddle.seed(seed) for step in trange(args.benchmark_steps): @@ -193,7 +180,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -216,7 +204,8 @@ def main(args): num_inference_steps=20, height=height, width=width, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test img2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -227,7 +216,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -238,7 +228,9 @@ def main(args): images[0].save(f"{folder}/img2img.png") if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -263,7 +255,8 @@ def main(args): num_inference_steps=20, height=height, width=width, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -275,7 +268,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -288,7 +282,9 @@ def main(args): if args.task_name in ["cycle_diffusion", "all"]: pipe.change_scheduler("ddim") - image_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + image_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + ) init_image = load_image(image_url) source_prompt = "An astronaut riding a horse" prompt = "An astronaut riding an elephant" @@ -305,7 +301,8 @@ def main(args): source_guidance_scale=1, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images[0] + parse_prompt_type=parse_prompt_type, + ).images[0] print("==> Test cycle diffusion performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -321,7 +318,8 @@ def main(args): 
source_guidance_scale=1, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -345,7 +343,8 @@ def main(args): hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height, enable_hr=True, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test hiresfix performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -359,7 +358,8 @@ def main(args): hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height, enable_hr=True, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/infer_dygraph_torch.py b/ppdiffusers/deploy/infer_dygraph_torch.py index fad812b22d8df..0f50cdd5a7502 100644 --- a/ppdiffusers/deploy/infer_dygraph_torch.py +++ b/ppdiffusers/deploy/infer_dygraph_torch.py @@ -18,19 +18,27 @@ import torch -torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention delattr(torch.nn.functional, "scaled_dot_product_attention") import numpy as np from diffusers import ( - CycleDiffusionPipeline, DDIMScheduler, DDPMScheduler, - DEISMultistepScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, UniPCMultistepScheduler) -from diffusers.models.attention_processor import (AttnProcessor, - AttnProcessor2_0) + CycleDiffusionPipeline, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 from diffusers.utils import load_image from tqdm.auto import trange @@ -52,46 +60,40 @@ def change_scheduler(self, scheduler_type="ddim"): self.orginal_scheduler_config = self.scheduler.config scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + 
scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") return scheduler @@ -103,17 +105,20 @@ def parse_arguments(): parser.add_argument( "--pretrained_model_name_or_path", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=10, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -136,20 +141,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) parser.add_argument( "--channels_last", type=strtobool, default=False, - help="Wheter to use channels_last", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument( - "--compile", type=strtobool, default=False, help="compile") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") parser.add_argument( "--attention_type", type=str, @@ -158,12 +160,9 @@ def parse_arguments(): "raw", "sdp", ], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -183,11 +182,10 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") return parser.parse_args() @@ -246,8 +244,8 @@ def main(args): feature_extractor=None, requires_safety_checker=False, torch_dtype=torch_dtype, - custom_pipeline="stable_diffusion_mega" - if args.parse_prompt_type == "raw" else "lpw_stable_diffusion", ) + custom_pipeline="stable_diffusion_mega" if args.parse_prompt_type == "raw" else "lpw_stable_diffusion", + ) scheduler = change_scheduler(pipe, args.scheduler) pipe.scheduler = scheduler if args.device_id >= 0: @@ -259,11 +257,9 @@ def main(args): args.attention_type = [args.attention_type] for attention_type in args.attention_type: - attn_prrocessor_cls = (AttnProcessor - if attention_type == "raw" else AttnProcessor2_0) + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = ( - torch.nn.functional.scaled_dot_product_attention_) + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ set_attn_processor(pipe.unet, attn_prrocessor_cls()) set_attn_processor(pipe.vae, attn_prrocessor_cls()) @@ -272,15 +268,13 @@ def main(args): if args.compile: print("Run torch compile") - pipe.unet = torch.compile( - pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) width = args.width height = args.height pipe.set_progress_bar_config(disable=True) - folder = (f"torch_attn_{attention_type}_fp16" - if args.use_fp16 else f"torch_attn_{attention_type}_fp32") + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img", "all"]: # text2img @@ -291,7 +285,8 @@ def main(args): prompt, num_inference_steps=10, height=height, - width=width, ) + width=width, + ) print("==> Test text2img performance.") torch.cuda.manual_seed(seed) for step in trange(args.benchmark_steps): @@ -300,7 +295,8 @@ def main(args): prompt, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -322,7 +318,8 @@ def main(args): image=init_image, num_inference_steps=20, height=height, - width=width, ) + width=width, + ) print("==> Test img2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -332,7 +329,8 @@ def main(args): image=init_image, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -343,7 +341,9 @@ def main(args): images[0].save(f"{folder}/img2img.png") if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = 
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url).resize((width, height)) mask_image = load_image(mask_url).resize((width, height)) @@ -365,7 +365,8 @@ def main(args): prompt, image=init_image, mask_image=mask_image, - num_inference_steps=20, ) + num_inference_steps=20, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -374,7 +375,8 @@ def main(args): prompt, image=init_image, mask_image=mask_image, - num_inference_steps=args.inference_steps, ).images + num_inference_steps=args.inference_steps, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -395,7 +397,8 @@ def main(args): scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) cycle_pipe.set_progress_bar_config(disable=True) scheduler = change_scheduler(cycle_pipe, "ddim") cycle_pipe.scheduler = scheduler @@ -413,7 +416,8 @@ def main(args): eta=0.1, strength=0.8, guidance_scale=2, - source_guidance_scale=1, ).images[0] + source_guidance_scale=1, + ).images[0] print("==> Test cycle diffusion performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -426,7 +430,8 @@ def main(args): eta=0.1, strength=0.8, guidance_scale=2, - source_guidance_scale=1, ).images + source_guidance_scale=1, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py b/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py index 3fe17fd46e9c8..d26495eaa34ba 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py @@ -20,21 +20,23 @@ import paddle from ppdiffusers import ( - FastDeployRuntimeModel, FastDeployStableDiffusionImageVariationPipeline, - StableDiffusionImageVariationPipeline, UNet2DConditionModel) + FastDeployRuntimeModel, + FastDeployStableDiffusionImageVariationPipeline, + StableDiffusionImageVariationPipeline, + UNet2DConditionModel, +) def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - output_path: str, - sample: bool=False, - height: int=None, - width: int=None, ): + model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): # specify unet model with unet pre_temb_act opt enabled. 
- unet_model = UNet2DConditionModel.from_pretrained( - model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - pipeline = StableDiffusionImageVariationPipeline.from_pretrained( - model_path, unet=unet_model, safety_checker=None) + unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") + pipeline = StableDiffusionImageVariationPipeline.from_pretrained(model_path, unet=unet_model, safety_checker=None) # make sure we disable xformers pipeline.disable_xformers_memory_efficient_attention() output_path = Path(output_path) @@ -42,8 +44,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( latent_height = height // 8 if height is not None else None latent_width = width // 8 if width is not None else None # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 or 9 vae_in_channels = pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -51,13 +52,13 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert image_encoder image_encoder = paddle.jit.to_static( pipeline.image_encoder, input_spec=[ - paddle.static.InputSpec( - shape=[None, 3, 224, 224], dtype="float32", name="pixel_values") + paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype="float32", name="pixel_values") ], # pixel_values ) save_path = os.path.join(args.output_path, "image_encoder", "inference") @@ -72,14 +73,16 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( paddle.static.InputSpec( shape=[None, unet_channels, latent_height, latent_width], dtype="float32", - name="sample", ), # sample - paddle.static.InputSpec( - shape=[1], dtype="float32", name="timestep"), # timestep + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states - ], ) + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) print(f"Save unet model in {save_path} successfully.") @@ -94,8 +97,7 @@ def forward_vae_encoder_sample(self, z): # 3. Convert vae encoder vae_encoder = pipeline.vae if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) else: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) @@ -107,7 +109,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. 
save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -126,8 +129,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -137,17 +142,15 @@ def forward_vae_decoder(self, z): fd_pipe_cls = FastDeployStableDiffusionImageVariationPipeline fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - image_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "image_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + image_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "image_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=pipeline.feature_extractor, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -161,26 +164,25 @@ def forward_vae_decoder(self, z): required=True, help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--sample", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) parser.add_argument( "--height", type=int, default=None, - help="The height of output images. Default: None", ) + help="The height of output images. Default: None", + ) parser.add_argument( "--width", type=int, default=None, - help="The width of output images. Default: None", ) + help="The width of output images. 
Default: None", + ) args = parser.parse_args() convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( @@ -188,4 +190,5 @@ def forward_vae_decoder(self, z): args.output_path, args.sample, args.height, - args.width, ) + args.width, + ) diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py index 55b908c4787b2..dcb4b78edb046 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py @@ -32,17 +32,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="lambdalabs/sd-image-variations-diffusers@fastdeploy", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--backend", type=str, @@ -62,7 +65,8 @@ def parse_arguments(): "huawei_ascend_npu", "kunlunxin_xpu", ], - help="The inference runtime device of models.", ) + help="The inference runtime device of models.", + ) parser.add_argument( "--parse_prompt_type", type=str, @@ -71,22 +75,11 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") - parser.add_argument( - "--use_bf16", - type=strtobool, - default=False, - help="Wheter to use BF16 mode") - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -107,7 +100,8 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) + help="The scheduler type of stable diffusion.", + ) parser.add_argument( "--infer_op", type=str, @@ -117,23 +111,13 @@ def parse_arguments(): "raw", "all", ], - help="The type of infer op.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") - parser.add_argument( - "--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + help="The type of infer op.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") return parser.parse_args() @@ -149,15 +133,16 @@ def create_ort_runtime(device_id=0): def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - use_bf16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, ): + use_trt=False, + dynamic_shape=None, + use_fp16=False, + use_bf16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, + workspace=None, +): assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive" option = fd.RuntimeOption() option.use_paddle_backend() @@ -198,7 +183,8 @@ def create_paddle_inference_runtime( key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), - shape_dict.get("max_shape", None), ) + shape_dict.get("max_shape", None), + ) return option @@ -209,8 +195,10 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): option.use_ascend() option.set_lite_device_names(["huawei_ascend_npu"]) option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision". 
- format(device_id)) + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) elif device == "kunlunxin_xpu": # TODO(shentanyue): Add kunlunxin_xpu code # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 @@ -222,7 +210,8 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): autotune_file="", precision="int16", adaptive_seqlen=True, - enable_multi_stream=True, ) + enable_multi_stream=True, + ) if use_fp16: option.enable_lite_fp16() else: @@ -230,10 +219,7 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): return option -def create_trt_runtime(workspace=(1 << 31), - dynamic_shape=None, - use_fp16=False, - device_id=0): +def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): option = fd.RuntimeOption() option.use_trt_backend() option.use_gpu(device_id) @@ -247,7 +233,8 @@ def create_trt_runtime(workspace=(1 << 31), key, min_shape=shape_dict["min_shape"], opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), ) + max_shape=shape_dict.get("max_shape", None), + ) return option @@ -257,8 +244,7 @@ def main(args): paddle_stream = None else: paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream( - args.device_id).cuda_stream + paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream seed = 1024 vae_in_channels = 4 @@ -286,12 +272,9 @@ def main(args): vae_decoder_dynamic_shape = { "latent_sample": { - "min_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": - [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } } @@ -333,37 +316,38 @@ def main(args): text_encoder=create_ort_runtime(device_id=args.device_id), vae_encoder=create_ort_runtime(device_id=args.device_id), vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), ) + unet=create_ort_runtime(device_id=args.device_id), + ) elif args.backend == "paddlelite": runtime_options = dict( - text_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime( - device=args.device, - device_id=args.device_id, - use_fp16=args.use_fp16), ) + text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), + ) elif args.backend == "tensorrt": runtime_options = dict( image_encoder=create_trt_runtime( dynamic_shape=image_encoder_dynamic_shape, use_fp16=args.use_fp16, - 
device_id=args.device_id, ), + device_id=args.device_id, + ), vae_encoder=create_trt_runtime( dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_decoder=create_trt_runtime( dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), unet=create_trt_runtime( dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), ) + device_id=args.device_id, + ), + ) elif args.backend == "paddle" or args.backend == "paddle_tensorrt": args.use_trt = args.backend == "paddle_tensorrt" runtime_options = dict( @@ -374,31 +358,37 @@ def main(args): use_bf16=args.use_bf16, device_id=args.device_id, disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_encoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_decoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), unet=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), ) + paddle_stream=paddle_stream, + ), + ) pipe = FastDeployStableDiffusionImageVariationPipeline.from_pretrained( args.model_dir, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) # parse_prompt_type = args.parse_prompt_type @@ -412,9 +402,7 @@ def main(args): else: infer_op_list = [args.infer_op] if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print( - "When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op." 
- ) + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") infer_op_list = ["raw"] for infer_op in infer_op_list: @@ -424,12 +412,13 @@ def main(args): "image_encoder": infer_op, "unet": infer_op, } - folder = (f"infer_op_{infer_op}_fp16" - if args.use_fp16 else f"infer_op_{infer_op}_fp32") + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" os.makedirs(folder, exist_ok=True) # image_variation - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) init_image = load_image(img_url) time_costs = [] # warmup @@ -438,7 +427,8 @@ def main(args): num_inference_steps=20, height=height, width=width, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test image_variation performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -448,7 +438,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py index 0b9e67bda034b..17b2290357f54 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py @@ -35,17 +35,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--parse_prompt_type", type=str, @@ -54,37 +57,21 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") parser.add_argument( "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") return parser.parse_args() @@ -99,7 +86,8 @@ def main(args): pipe = StableDiffusionImageVariationPipeline.from_pretrained( args.model_dir, safety_checker=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipe.set_progress_bar_config(disable=True) # parse_prompt_type = args.parse_prompt_type if args.attention_type == "all": @@ -126,12 +114,13 @@ def main(args): height = args.height # hr_resize_width = args.hr_resize_width # hr_resize_height = args.hr_resize_height - folder = (f"attn_{attention_type}_fp16" - if args.use_fp16 else f"attn_{attention_type}_fp32") + folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) # image_variation - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) init_image = load_image(img_url) time_costs = [] # warmup @@ -139,7 +128,8 @@ def main(args): image=init_image, num_inference_steps=20, height=height, - width=width, ) + width=width, + ) print("==> Test image_variation performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -148,7 +138,8 @@ def main(args): image=init_image, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py index 126e4f0819e6c..fb1530d071d21 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py @@ -18,19 +18,26 @@ import torch -torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention delattr(torch.nn.functional, "scaled_dot_product_attention") import numpy as np from diffusers import ( - DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionImageVariationPipeline, UniPCMultistepScheduler) -from diffusers.models.attention_processor import (AttnProcessor, - AttnProcessor2_0) + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImageVariationPipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, 
AttnProcessor2_0 from diffusers.utils import load_image from tqdm.auto import trange @@ -52,46 +59,40 @@ def change_scheduler(self, scheduler_type="ddim"): self.orginal_scheduler_config = self.scheduler.config scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") return scheduler @@ -103,17 +104,20 @@ def parse_arguments(): parser.add_argument( "--pretrained_model_name_or_path", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=10, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( 
"--parse_prompt_type", type=str, @@ -122,20 +126,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) parser.add_argument( "--channels_last", type=strtobool, default=False, - help="Wheter to use channels_last", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument( - "--compile", type=strtobool, default=False, help="compile") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") parser.add_argument( "--attention_type", type=str, @@ -144,12 +145,9 @@ def parse_arguments(): "raw", "sdp", ], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -169,11 +167,10 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") return parser.parse_args() @@ -230,7 +227,8 @@ def main(args): args.pretrained_model_name_or_path, safety_checker=None, requires_safety_checker=False, - torch_dtype=torch_dtype, ) + torch_dtype=torch_dtype, + ) scheduler = change_scheduler(pipe, args.scheduler) pipe.scheduler = scheduler if args.device_id >= 0: @@ -242,11 +240,9 @@ def main(args): args.attention_type = [args.attention_type] for attention_type in args.attention_type: - attn_prrocessor_cls = (AttnProcessor - if attention_type == "raw" else AttnProcessor2_0) + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = ( - torch.nn.functional.scaled_dot_product_attention_) + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ set_attn_processor(pipe.unet, attn_prrocessor_cls()) set_attn_processor(pipe.vae, attn_prrocessor_cls()) @@ -255,19 +251,19 @@ def main(args): if args.compile: print("Run torch compile") - pipe.unet = torch.compile( - pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) width = args.width height = args.height pipe.set_progress_bar_config(disable=True) - folder = (f"torch_attn_{attention_type}_fp16" - if args.use_fp16 else f"torch_attn_{attention_type}_fp32") + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) # image_vairation - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + img_url = ( + 
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) init_image = load_image(img_url).resize((width, height)) time_costs = [] # warmup @@ -275,7 +271,8 @@ def main(args): image=init_image, num_inference_steps=20, height=height, - width=width, ) + width=width, + ) print("==> Test image_vairation performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -284,7 +281,8 @@ def main(args): image=init_image, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/app.py b/ppdiffusers/examples/Stable-CycleDiffusion/app.py index 121d115b10745..705f42d3d6fa3 100644 --- a/ppdiffusers/examples/Stable-CycleDiffusion/app.py +++ b/ppdiffusers/examples/Stable-CycleDiffusion/app.py @@ -37,7 +37,8 @@ pipe = CycleDiffusionPipeline.from_pretrained( model_id_or_path, use_auth_token=os.environ.get("USER_TOKEN"), - paddle_dtype=paddle_dtype, ) + paddle_dtype=paddle_dtype, +) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) tokenizer = pipe.tokenizer @@ -45,17 +46,11 @@ class LocalBlend: def __call__(self, x_t, attention_store): k = 1 - maps = attention_store["down_cross"][2:4] + attention_store[ - "up_cross"][:3] - maps = [ - item.reshape( - [self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS]) - for item in maps - ] + maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3] + maps = [item.reshape([self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS]) for item in maps] maps = paddle.concat(maps, axis=1) maps = (maps * self.alpha_layers).sum(-1).mean(1) - mask = F.max_pool2d( - maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k)) + mask = F.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k)) mask = F.interpolate(mask, size=(x_t.shape[2:])) mask = mask / mask.max(2, keepdim=True)[0].max(3, keepdim=True)[0] mask = mask > self.threshold @@ -150,8 +145,7 @@ def between_steps(self): def get_average_attention(self): average_attention = { - key: [item / self.cur_step for item in self.attention_store[key]] - for key in self.attention_store + key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store } return average_attention @@ -174,8 +168,7 @@ def step_callback(self, x_t): def replace_self_attention(self, attn_base, att_replace): if att_replace.shape[2] <= 16**2: - return attn_base.unsqueeze(0).expand( - [att_replace.shape[0], *attn_base.shape]) + return attn_base.unsqueeze(0).expand([att_replace.shape[0], *attn_base.shape]) else: return att_replace @@ -185,36 +178,35 @@ def replace_cross_attention(self, attn_base, att_replace): def forward(self, attn, is_cross: bool, place_in_unet: str): super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet) - if is_cross or (self.num_self_replace[0] <= self.cur_step < - self.num_self_replace[1]): + if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]): attn_base, attn_repalce = attn[0], attn[1:] if is_cross: alpha_words = self.cross_replace_alpha[self.cur_step] - attn_replace_new = (self.replace_cross_attention( - attn_base, attn_repalce) * alpha_words + - (1 - alpha_words) * attn_repalce) + attn_replace_new = ( + self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + + (1 - alpha_words) * attn_repalce + ) attn[1:] = attn_replace_new 
else: attn[1:] = self.replace_self_attention(attn_base, attn_repalce) return attn def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: Union[float, Tuple[float, float], Dict[ - str, Tuple[float, float]]], - self_replace_steps: Union[float, Tuple[float, float]], - local_blend: Optional[LocalBlend], ): + self, + prompts, + num_steps: int, + cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]], + self_replace_steps: Union[float, Tuple[float, float]], + local_blend: Optional[LocalBlend], + ): super(AttentionControlEdit, self).__init__() self.batch_size = len(prompts) self.cross_replace_alpha = ptp_utils.get_time_words_attention_alpha( - prompts, num_steps, cross_replace_steps, - tokenizer).cast(paddle_dtype) + prompts, num_steps, cross_replace_steps, tokenizer + ).cast(paddle_dtype) if type(self_replace_steps) is float or type(self_replace_steps) is int: self_replace_steps = 0, self_replace_steps - self.num_self_replace = int(num_steps * self_replace_steps[0]), int( - num_steps * self_replace_steps[1]) + self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1]) self.local_blend = local_blend @@ -223,17 +215,17 @@ def replace_cross_attention(self, attn_base, att_replace): return paddle.einsum("hpw,bwn->bhpn", attn_base, self.mapper) def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: float, - self_replace_steps: float, - local_blend: Optional[LocalBlend]=None, ): - super(AttentionReplace, self).__init__(prompts, num_steps, - cross_replace_steps, - self_replace_steps, local_blend) - self.mapper = seq_aligner.get_replacement_mapper( - prompts, tokenizer).cast(paddle_dtype) + self, + prompts, + num_steps: int, + cross_replace_steps: float, + self_replace_steps: float, + local_blend: Optional[LocalBlend] = None, + ): + super(AttentionReplace, self).__init__( + prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend + ) + self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).cast(paddle_dtype) class AttentionRefine(AttentionControlEdit): @@ -243,35 +235,33 @@ def replace_cross_attention(self, attn_base, att_replace): # pt: a[:, :, b].shape = torch.Size([8, 4096, 1, 77]) # pd: a.take_along_axis(b.unsqueeze(0), axis=-1).unsqueeze(-2) - attn_base_replace = (attn_base.take_along_axis( - self.mapper.unsqueeze(0), axis=-1).unsqueeze(-2) - .transpose([2, 0, 1, 3])) - attn_replace = attn_base_replace * self.alphas + att_replace * ( - 1 - self.alphas) + attn_base_replace = ( + attn_base.take_along_axis(self.mapper.unsqueeze(0), axis=-1).unsqueeze(-2).transpose([2, 0, 1, 3]) + ) + attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas) return attn_replace def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: float, - self_replace_steps: float, - local_blend: Optional[LocalBlend]=None, ): - super(AttentionRefine, self).__init__(prompts, num_steps, - cross_replace_steps, - self_replace_steps, local_blend) - self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, - tokenizer) + self, + prompts, + num_steps: int, + cross_replace_steps: float, + self_replace_steps: float, + local_blend: Optional[LocalBlend] = None, + ): + super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend) + self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer) alphas = alphas.cast(paddle_dtype) self.alphas = alphas.reshape([alphas.shape[0], 1, 1, 
alphas.shape[1]]) def get_equalizer( - text: str, - word_select: Union[int, Tuple[int, ...]], - values: Union[List[float], Tuple[float, ...]], ): + text: str, + word_select: Union[int, Tuple[int, ...]], + values: Union[List[float], Tuple[float, ...]], +): if type(word_select) is int or type(word_select) is str: - word_select = (word_select, ) + word_select = (word_select,) equalizer = paddle.ones([len(values), 77]) values = paddle.to_tensor(values, dtype=paddle_dtype) for word in word_select: @@ -281,19 +271,20 @@ def get_equalizer( def inference( - source_prompt, - target_prompt, - source_guidance_scale=1, - guidance_scale=5, - num_inference_steps=100, - width=512, - height=512, - seed=0, - img=None, - strength=0.7, - cross_attention_control="None", - cross_replace_steps=0.8, - self_replace_steps=0.4, ): + source_prompt, + target_prompt, + source_guidance_scale=1, + guidance_scale=5, + num_inference_steps=100, + width=512, + height=512, + seed=0, + img=None, + strength=0.7, + cross_attention_control="None", + cross_replace_steps=0.8, + self_replace_steps=0.4, +): paddle.seed(seed) @@ -312,21 +303,22 @@ def inference( [source_prompt, target_prompt], num_inference_steps, cross_replace_steps=cross_replace_steps, - self_replace_steps=self_replace_steps, ) + self_replace_steps=self_replace_steps, + ) ptp_utils.register_attention_control(pipe, controller) elif cross_attention_control == "Refine": controller = AttentionRefine( [source_prompt, target_prompt], num_inference_steps, cross_replace_steps=cross_replace_steps, - self_replace_steps=self_replace_steps, ) + self_replace_steps=self_replace_steps, + ) ptp_utils.register_attention_control(pipe, controller) elif cross_attention_control == "None": controller = EmptyControl() ptp_utils.register_attention_control(pipe, controller) else: - raise ValueError("Unknown cross_attention_control: {}".format( - cross_attention_control)) + raise ValueError("Unknown cross_attention_control: {}".format(cross_attention_control)) with paddle.amp.auto_cast(True, level="O2"): results = pipe( @@ -337,7 +329,8 @@ def inference( eta=0.1, strength=strength, guidance_scale=guidance_scale, - source_guidance_scale=source_guidance_scale, ) + source_guidance_scale=source_guidance_scale, + ) if pipe.safety_checker is None: return results.images[0] else: @@ -354,7 +347,8 @@ def replace_nsfw_images(results): css = """.cycle-diffusion-div div{display:inline-flex;align-items:center;gap:.8rem;font-size:1.75rem}.cycle-diffusion-div div h1{font-weight:900;margin-bottom:7px}.cycle-diffusion-div p{margin-bottom:10px;font-size:94%}.cycle-diffusion-div p a{text-decoration:underline}.tabs{margin-top:0;margin-bottom:0}#gallery{min-height:20rem} """ with gr.Blocks(css=css) as demo: - gr.HTML(""" + gr.HTML( + """

CycleDiffusion with Stable Diffusion

@@ -370,9 +364,11 @@ def replace_nsfw_images(results):
                2. Click the "Run CycleDiffusion" button.

- """) + """ + ) with gr.Accordion("See Details", open=False): - gr.HTML(""" + gr.HTML( + """

How to use:
@@ -396,14 +392,14 @@ def replace_nsfw_images(results):
                1. 20s on A10G.

- """) + """ + ) with gr.Row(): with gr.Column(scale=55): with gr.Group(): - img = gr.Image( - label="Input image", height=512, tool="editor", type="pil") + img = gr.Image(label="Input image", height=512, tool="editor", type="pil") image_out = gr.Image(label="Output image", height=512) # gallery = gr.Gallery( @@ -422,7 +418,8 @@ def replace_nsfw_images(results): label="Source guidance scale", value=1, minimum=1, - maximum=10, ) + maximum=10, + ) with gr.Row(): target_prompt = gr.Textbox( label="Target prompt", @@ -432,14 +429,16 @@ def replace_nsfw_images(results): label="Target guidance scale", value=5, minimum=1, - maximum=10, ) + maximum=10, + ) with gr.Row(): strength = gr.Slider( label="Strength", value=0.7, minimum=0.5, maximum=1, - step=0.01, ) + step=0.01, + ) with gr.Row(): generate1 = gr.Button(value="Run CycleDiffusion") @@ -449,7 +448,8 @@ def replace_nsfw_images(results): cross_attention_control = gr.Radio( label="CAC type", choices=["None", "Replace", "Refine"], - value="None", ) + value="None", + ) with gr.Row(): # If not "None", the following two parameters will be used. cross_replace_steps = gr.Slider( @@ -457,13 +457,15 @@ def replace_nsfw_images(results): value=0.8, minimum=0.0, maximum=1, - step=0.01, ) + step=0.01, + ) self_replace_steps = gr.Slider( label="Self replace steps", value=0.4, minimum=0.0, maximum=1, - step=0.01, ) + step=0.01, + ) with gr.Row(): generate2 = gr.Button(value="Run CycleDiffusion") @@ -475,23 +477,13 @@ def replace_nsfw_images(results): value=100, minimum=25, maximum=500, - step=1, ) - width = gr.Slider( - label="Width", - value=512, - minimum=512, - maximum=1024, - step=8) - height = gr.Slider( - label="Height", - value=512, - minimum=512, - maximum=1024, - step=8) + step=1, + ) + width = gr.Slider(label="Width", value=512, minimum=512, maximum=1024, step=8) + height = gr.Slider(label="Height", value=512, minimum=512, maximum=1024, step=8) with gr.Row(): - seed = gr.Slider( - 0, 2147483647, label="Seed", value=0, step=1) + seed = gr.Slider(0, 2147483647, label="Seed", value=0, step=1) with gr.Row(): generate3 = gr.Button(value="Run CycleDiffusion") @@ -714,11 +706,14 @@ def replace_nsfw_images(results): ], image_out, inference, - cache_examples=True, ) + cache_examples=True, + ) - gr.Markdown(""" + gr.Markdown( + """ Space built with PPDiffusers 🧨 by PaddleNLP. 
[![Twitter Follow](https://img.shields.io/twitter/follow/ChenHenryWu?style=social)](https://twitter.com/ChenHenryWu) - """) + """ + ) demo.launch(debug=True, share=True, server_name="0.0.0.0", server_port=8581) diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py index d09df121e1427..15df9ac4402ff 100644 --- a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py +++ b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py @@ -22,13 +22,9 @@ def register_attention_control(model, controller): def ca_forward(self, place_in_unet): - def forward(hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs): + def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = self.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = self.to_q(hidden_states) query = self.head_to_batch_dim(query) @@ -41,11 +37,9 @@ def forward(hidden_states, key = self.head_to_batch_dim(key) value = self.head_to_batch_dim(value) - attention_probs = self.get_attention_scores(query, key, - attention_mask) + attention_probs = self.get_attention_scores(query, key, attention_mask) - attention_probs = controller(attention_probs, is_cross, - place_in_unet) + attention_probs = controller(attention_probs, is_cross, place_in_unet) hidden_states = paddle.matmul(attention_probs, value) hidden_states = self.batch_to_head_dim(hidden_states) @@ -82,17 +76,12 @@ def register_recr(net_, count, place_in_unet): def get_word_inds(text: str, word_place: int, tokenizer): split_text = text.split(" ") if type(word_place) is str: - word_place = [ - i for i, word in enumerate(split_text) if word_place == word - ] + word_place = [i for i, word in enumerate(split_text) if word_place == word] elif type(word_place) is int: word_place = [word_place] out = [] if len(word_place) > 0: - words_encode = [ - tokenizer.decode([item]).strip("#") - for item in tokenizer.encode(text).input_ids - ][1:-1] + words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1] cur_len, ptr = 0, 0 for i in range(len(words_encode)): @@ -106,14 +95,14 @@ def get_word_inds(text: str, word_place: int, tokenizer): def update_alpha_time_word( - alpha, - bounds: Union[float, Tuple[float, float]], - prompt_ind: int, - word_inds: Optional[paddle.Tensor]=None, ): + alpha, + bounds: Union[float, Tuple[float, float]], + prompt_ind: int, + word_inds: Optional[paddle.Tensor] = None, +): if type(bounds) is float or bounds == 0: bounds = 0, bounds - start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * - alpha.shape[0]) + start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0]) if word_inds is None: word_inds = paddle.arange(alpha.shape[2]) alpha[:start, prompt_ind, word_inds] = 0 @@ -123,32 +112,26 @@ def update_alpha_time_word( def get_time_words_attention_alpha( - prompts, - num_steps, - cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[ - float, float]]], - tokenizer, - max_num_words=77, ): + prompts, + num_steps, + cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]], + tokenizer, + max_num_words=77, +): if type(cross_replace_steps) is not dict: cross_replace_steps = {"default_": cross_replace_steps} if "default_" not in 
cross_replace_steps: cross_replace_steps["default_"] = (0.0, 1.0) - alpha_time_words = paddle.zeros( - [num_steps + 1, len(prompts) - 1, max_num_words]) + alpha_time_words = paddle.zeros([num_steps + 1, len(prompts) - 1, max_num_words]) for i in range(len(prompts) - 1): - alpha_time_words = update_alpha_time_word( - alpha_time_words, cross_replace_steps["default_"], i) + alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i) for key, item in cross_replace_steps.items(): if key != "default_": - inds = [ - get_word_inds(prompts[i], key, tokenizer) - for i in range(1, len(prompts)) - ] + inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))] for i, ind in enumerate(inds): if len(ind) > 0: - alpha_time_words = update_alpha_time_word(alpha_time_words, - item, i, ind) + alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind) alpha_time_words = alpha_time_words.reshape( - [num_steps + 1, len(prompts) - 1, 1, 1, - max_num_words]) # time, batch, heads, pixels, words + [num_steps + 1, len(prompts) - 1, 1, 1, max_num_words] + ) # time, batch, heads, pixels, words return alpha_time_words diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py index e1b1bc7bb6ccf..24c30b91e7f7d 100644 --- a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py +++ b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py @@ -66,8 +66,7 @@ def global_align(x, y, score): for j in range(1, len(y) + 1): left = matrix[i, j - 1] + score.gap up = matrix[i - 1, j] + score.gap - diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], - y[j - 1]) + diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1]) matrix[i, j] = max(left, up, diag) if matrix[i, j] == left: trace_back[i, j] = 1 @@ -112,14 +111,20 @@ def get_mapper(x: str, y: str, tokenizer, max_len=77): score = ScoreParams(0, 1, -1) matrix, trace_back = global_align(x_seq, y_seq, score) mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1] - alphas = paddle.ones([max_len, ]) - alphas[:mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32") + alphas = paddle.ones( + [ + max_len, + ] + ) + alphas[: mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32") mapper = paddle.zeros( - [max_len, ], - dtype=paddle.int64, ) - mapper[:mapper_base.shape[0]] = mapper_base[:, 1] - mapper[mapper_base.shape[0]:] = len(y_seq) + paddle.arange( - max_len - len(y_seq), dtype="int64") + [ + max_len, + ], + dtype=paddle.int64, + ) + mapper[: mapper_base.shape[0]] = mapper_base[:, 1] + mapper[mapper_base.shape[0] :] = len(y_seq) + paddle.arange(max_len - len(y_seq), dtype="int64") return mapper, alphas @@ -136,17 +141,12 @@ def get_refinement_mapper(prompts, tokenizer, max_len=77): def get_word_inds(text: str, word_place: int, tokenizer): split_text = text.split(" ") if type(word_place) is str: - word_place = [ - i for i, word in enumerate(split_text) if word_place == word - ] + word_place = [i for i, word in enumerate(split_text) if word_place == word] elif type(word_place) is int: word_place = [word_place] out = [] if len(word_place) > 0: - words_encode = [ - tokenizer.decode([item]).strip("#") - for item in tokenizer.encode(text).input_ids - ][1:-1] + words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1] cur_len, ptr = 0, 0 for i in range(len(words_encode)): @@ -175,8 +175,7 @@ def get_replacement_mapper_(x: str, y: str, 
tokenizer, max_len=77): cur_inds = 0 while i < max_len and j < max_len: if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i: - inds_source_, inds_target_ = inds_source[cur_inds], inds_target[ - cur_inds] + inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds] if len(inds_source_) == len(inds_target_): mapper[inds_source_, inds_target_] = 1 else: diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py index 81a81d63cc039..e086453002714 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py @@ -45,13 +45,10 @@ def __init__(self, paths, size=None, random_crop=False, labels=None): if self.size is not None and self.size > 0: self.rescaler = albumentations.SmallestMaxSize(max_size=self.size) if not self.random_crop: - self.cropper = albumentations.CenterCrop( - height=self.size, width=self.size) + self.cropper = albumentations.CenterCrop(height=self.size, width=self.size) else: - self.cropper = albumentations.RandomCrop( - height=self.size, width=self.size) - self.preprocessor = albumentations.Compose( - [self.rescaler, self.cropper]) + self.cropper = albumentations.RandomCrop(height=self.size, width=self.size) + self.preprocessor = albumentations.Compose([self.rescaler, self.cropper]) else: self.preprocessor = lambda **kwargs: kwargs @@ -102,12 +99,7 @@ def __init__(self, cause, keys=None, visited=None): super().__init__(message) -def retrieve(list_or_dict, - key, - splitval="/", - default=None, - expand=True, - pass_success=False): +def retrieve(list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False): """Given a nested list or dict return the desired value at key expanding callable nodes if necessary and :attr:`expand` is ``True``. The expansion is done in-place. @@ -150,11 +142,10 @@ def retrieve(list_or_dict, if callable(list_or_dict): if not expand: raise KeyNotFoundError( - ValueError( - "Trying to get past callable node with expand=False." - ), + ValueError("Trying to get past callable node with expand=False."), keys=keys, - visited=visited, ) + visited=visited, + ) list_or_dict = list_or_dict() parent[last_key] = list_or_dict @@ -187,23 +178,19 @@ def retrieve(list_or_dict, return list_or_dict, success -def give_synsets_from_indices(indices, - path_to_yaml="data/imagenet_idx_to_synset.yaml"): +def give_synsets_from_indices(indices, path_to_yaml="data/imagenet_idx_to_synset.yaml"): synsets = [] with open(path_to_yaml) as f: di2s = yaml.load(f) for idx in indices: synsets.append(str(di2s[idx])) - print("Using {} different synsets for construction of Restriced Imagenet.". 
- format(len(synsets))) + print("Using {} different synsets for construction of Restriced Imagenet.".format(len(synsets))) return synsets def str_to_indices(string): """Expects a string in the format '32-123, 256, 280-321'""" - assert not string.endswith( - ","), "provided string '{}' ends with a comma, pls remove it".format( - string) + assert not string.endswith(","), "provided string '{}' ends with a comma, pls remove it".format(string) subs = string.split(",") indices = [] for sub in subs: @@ -236,8 +223,7 @@ def __init__(self, config=None): self.config = config if not type(self.config) == dict: self.config = {} - self.keep_orig_class_label = self.config.get("keep_orig_class_label", - False) + self.keep_orig_class_label = self.config.get("keep_orig_class_label", False) self.process_images = True # if False we skip loading & processing images and self.data contains filepaths self._prepare() self._prepare_synset_to_human() @@ -255,14 +241,15 @@ def _prepare(self): raise NotImplementedError() def _filter_relpaths(self, relpaths): - ignore = set(["n06596364_9591.JPEG", ]) - relpaths = [ - rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore - ] + ignore = set( + [ + "n06596364_9591.JPEG", + ] + ) + relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore] if "sub_indices" in self.config: indices = str_to_indices(self.config["sub_indices"]) - synsets = give_synsets_from_indices( - indices, path_to_yaml=self.idx2syn) # returns a list of strings + synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings self.synset2idx = synset2idx(path_to_yaml=self.idx2syn) files = [] for rpath in relpaths: @@ -277,8 +264,7 @@ def _prepare_synset_to_human(self): SIZE = 2655750 URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1" self.human_dict = os.path.join(self.root, "synset_human.txt") - if (not os.path.exists(self.human_dict) or - not os.path.getsize(self.human_dict) == SIZE): + if not os.path.exists(self.human_dict) or not os.path.getsize(self.human_dict) == SIZE: download(URL, self.human_dict) def _prepare_idx_to_synset(self): @@ -289,8 +275,7 @@ def _prepare_idx_to_synset(self): def _prepare_human_to_integer_label(self): URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1" - self.human2integer = os.path.join(self.root, - "imagenet1000_clsidx_to_labels.txt") + self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt") if not os.path.exists(self.human2integer): download(URL, self.human2integer) with open(self.human2integer, "r") as f: @@ -306,15 +291,13 @@ def _load(self): self.relpaths = f.read().splitlines() l1 = len(self.relpaths) self.relpaths = self._filter_relpaths(self.relpaths) - print("Removed {} files from filelist during filtering.".format( - l1 - len(self.relpaths))) + print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths))) self.synsets = [p.split("/")[0] for p in self.relpaths] self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths] unique_synsets = np.unique(self.synsets) - class_dict = dict((synset, i) - for i, synset in enumerate(unique_synsets)) + class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets)) if not self.keep_orig_class_label: self.class_labels = [class_dict[s] for s in self.synsets] else: @@ -339,7 +322,8 @@ def _load(self): self.abspaths, labels=labels, size=self.size, - random_crop=self.random_crop, ) + random_crop=self.random_crop, + ) else: self.data = self.abspaths @@ 
-348,8 +332,12 @@ class ImageNetTrain(ImageNetBase): NAME = "ILSVRC2012_train" URL = "http://www.image-net.org/challenges/LSVRC/2012/" AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2" - FILES = ["ILSVRC2012_img_train.tar", ] - SIZES = [147897477120, ] + FILES = [ + "ILSVRC2012_img_train.tar", + ] + SIZES = [ + 147897477120, + ] def __init__(self, process_images=True, data_root=None, **kwargs): self.process_images = process_images @@ -360,15 +348,13 @@ def _prepare(self): if self.data_root: self.root = os.path.join(self.data_root, self.NAME) else: - cachedir = os.environ.get("XDG_CACHE_HOME", - os.path.expanduser("~/.cache")) + cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) self.root = os.path.join(cachedir, "autoencoders/data", self.NAME) self.datadir = os.path.join(self.root, "data") self.txt_filelist = os.path.join(self.root, "filelist.txt") self.expected_length = 1281167 - self.random_crop = retrieve( - self.config, "ImageNetTrain/random_crop", default=True) + self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop", default=True) if not is_prepared(self.root): # prep print("Preparing dataset {} in {}".format(self.NAME, self.root)) @@ -376,8 +362,7 @@ def _prepare(self): datadir = self.datadir if not os.path.exists(datadir): path = os.path.join(self.root, self.FILES[0]) - if (not os.path.exists(path) or - not os.path.getsize(path) == self.SIZES[0]): + if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]: import academictorrents as at atpath = at.get(self.AT_HASH, datastore=self.root) @@ -391,7 +376,7 @@ def _prepare(self): print("Extracting sub-tars.") subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar"))) for subpath in tqdm(subpaths): - subdir = subpath[:-len(".tar")] + subdir = subpath[: -len(".tar")] os.makedirs(subdir, exist_ok=True) with tarfile.open(subpath, "r:") as tar: tar.extractall(path=subdir) @@ -429,14 +414,12 @@ def _prepare(self): if self.data_root: self.root = os.path.join(self.data_root, self.NAME) else: - cachedir = os.environ.get("XDG_CACHE_HOME", - os.path.expanduser("~/.cache")) + cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) self.root = os.path.join(cachedir, "autoencoders/data", self.NAME) self.datadir = os.path.join(self.root, "data") self.txt_filelist = os.path.join(self.root, "filelist.txt") self.expected_length = 50000 - self.random_crop = retrieve( - self.config, "ImageNetValidation/random_crop", default=False) + self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop", default=False) if not is_prepared(self.root): # prep print("Preparing dataset {} in {}".format(self.NAME, self.root)) @@ -444,8 +427,7 @@ def _prepare(self): datadir = self.datadir if not os.path.exists(datadir): path = os.path.join(self.root, self.FILES[0]) - if (not os.path.exists(path) or - not os.path.getsize(path) == self.SIZES[0]): + if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]: import academictorrents as at atpath = at.get(self.AT_HASH, datastore=self.root) @@ -457,8 +439,7 @@ def _prepare(self): tar.extractall(path=datadir) vspath = os.path.join(self.root, self.FILES[1]) - if (not os.path.exists(vspath) or - not os.path.getsize(vspath) == self.SIZES[1]): + if not os.path.exists(vspath) or not os.path.getsize(vspath) == self.SIZES[1]: download(self.VS_URL, vspath) with open(vspath, "r") as f: @@ -486,14 +467,15 @@ def _prepare(self): class ImageNetSR(Dataset): def __init__( - self, - size=None, - degradation=None, - downscale_f=4, - 
min_crop_f=0.5, - max_crop_f=1.0, - random_crop=True, - output_LR_image=False, ): + self, + size=None, + degradation=None, + downscale_f=4, + min_crop_f=0.5, + max_crop_f=1.0, + random_crop=True, + output_LR_image=False, + ): """ Imagenet Superresolution Dataloader Performs following ops in order: @@ -522,30 +504,22 @@ def __init__( assert max_crop_f <= 1.0 self.center_crop = not random_crop - self.image_rescaler = albumentations.SmallestMaxSize( - max_size=size, interpolation=cv2.INTER_AREA) + self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA) - self.pil_interpolation = ( - False # gets reset later if incase interp_op is from pillow - ) + self.pil_interpolation = False # gets reset later if incase interp_op is from pillow if degradation == "bsrgan": - self.degradation_process = partial( - degradation_fn_bsr, sf=downscale_f) + self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f) elif degradation == "bsrgan_light": - self.degradation_process = partial( - degradation_fn_bsr_light, sf=downscale_f) + self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f) else: self.pil_interpolation = degradation.startswith("pil_") if self.pil_interpolation: interpolation_fn = degradation.replace("pil_", "") - self.degradation_process = partial( - TF.resize, - size=self.LR_size, - interpolation=interpolation_fn) + self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn) else: interpolation_fn = { "cv_nearest": cv2.INTER_NEAREST, @@ -555,7 +529,8 @@ def __init__( "cv_lanczos": cv2.INTER_LANCZOS4, }[degradation] self.degradation_process = albumentations.SmallestMaxSize( - max_size=self.LR_size, interpolation=interpolation_fn) + max_size=self.LR_size, interpolation=interpolation_fn + ) def __len__(self): return len(self.base) @@ -570,17 +545,14 @@ def __getitem__(self, i): image = np.array(image).astype(np.uint8) min_side_len = min(image.shape[:2]) - crop_side_len = min_side_len * np.random.uniform( - self.min_crop_f, self.max_crop_f, size=None) + crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None) crop_side_len = int(crop_side_len) if self.center_crop: - self.cropper = albumentations.CenterCrop( - height=crop_side_len, width=crop_side_len) + self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len) else: - self.cropper = albumentations.RandomCrop( - height=crop_side_len, width=crop_side_len) + self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len) image = self.cropper(image=image)["image"] image = self.image_rescaler(image=image)["image"] @@ -592,11 +564,9 @@ def __getitem__(self, i): LR_image = np.array(LR_image).astype(np.uint8) else: LR_image = self.degradation_process(image=image)["image"] - example["LR_image"] = (( - LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1])) + example["LR_image"] = (LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) - example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose( - [2, 0, 1]) + example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) return example @@ -608,7 +578,9 @@ def __init__(self, **kwargs): def get_base(self): with open("data/imagenet_train_hr_indices.p", "rb") as f: indices = pickle.load(f) - dset = ImageNetTrain(process_images=False, ) + dset = ImageNetTrain( + process_images=False, + ) return Subset(dset, indices) diff --git 
a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py index 37224cba9a9d9..890a4eea89241 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py @@ -13,5 +13,4 @@ # limitations under the License. from .bsrgan import degradation_bsrgan_variant as degradation_fn_bsr -from .bsrgan_light import \ - degradation_bsrgan_variant as degradation_fn_bsr_light +from .bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py index a50493c2591ea..1efdbaa95c8ca 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py @@ -51,7 +51,7 @@ def modcrop_np(img, sf): """ w, h = img.shape[:2] im = np.copy(img) - return im[:w - w % sf, :h - h % sf, ...] + return im[: w - w % sf, : h - h % sf, ...] """ @@ -69,7 +69,7 @@ def analytic_kernel(k): # Loop over the small kernel to fill the big one for r in range(k_size): for c in range(k_size): - big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k + big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k # Crop the edges of the big kernel to ignore very small values and increase run time of SR crop = k_size // 2 cropped_big_k = big_k[crop:-crop, crop:-crop] @@ -90,9 +90,9 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): """ v = np.dot( - np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), - np.array([1.0, 0.0]), ) + np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), + np.array([1.0, 0.0]), + ) V = np.array([[v[0], v[1]], [v[1], -v[0]]]) D = np.array([[l1, 0], [0, l2]]) Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) @@ -161,11 +161,12 @@ def blur(x, k): def gen_kernel( - k_size=np.array([15, 15]), - scale_factor=np.array([4, 4]), - min_var=0.6, - max_var=10.0, - noise_level=0, ): + k_size=np.array([15, 15]), + scale_factor=np.array([4, 4]), + min_var=0.6, + max_var=10.0, + noise_level=0, +): """ " # modified version of https://github.com/assafshocher/BlindSR_dataset_generator # Kai Zhang @@ -180,14 +181,12 @@ def gen_kernel( # Set COV matrix using Lambdas and Theta LAMBDA = np.diag([lambda_1, lambda_2]) - Q = np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @LAMBDA @Q.T + Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) + SIGMA = Q @ LAMBDA @ Q.T INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1 - ) # - 0.5 * (scale_factor - k_size % 2) + MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) MU = MU[None, None, :, None] # Create meshgrid for Gaussian @@ -197,7 +196,7 @@ def gen_kernel( # Calcualte Gaussian for every pixel of the kernel ZZ = Z - MU ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @INV_SIGMA @ZZ)) * (1 + noise) + raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) # shift the kernel so it will be centered # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) @@ -212,8 +211,7 @@ def fspecial_gaussian(hsize, sigma): hsize = [hsize, hsize] siz 
= [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] std = sigma - [x, y] = np.meshgrid( - np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) + [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) arg = -(x * x + y * y) / (2 * std * std) h = np.exp(arg) h[h < scipy.finfo(float).eps * h.max()] = 0 @@ -279,9 +277,7 @@ def srmd_degradation(x, k, sf=3): year={2018} } """ - x = ndimage.filters.convolve( - x, np.expand_dims( - k, axis=2), mode="wrap") # 'nearest' | 'mirror' + x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror' x = bicubic_degradation(x, sf=sf) return x @@ -359,13 +355,11 @@ def add_blur(img, sf=4): ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, - l2=l2, ) + l2=l2, + ) else: - k = fspecial("gaussian", 2 * random.randint(2, 11) + 3, - wd * random.random()) - img = ndimage.filters.convolve( - img, np.expand_dims( - k, axis=2), mode="mirror") + k = fspecial("gaussian", 2 * random.randint(2, 11) + 3, wd * random.random()) + img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror") return img @@ -381,7 +375,8 @@ def add_resize(img, sf=4): img = cv2.resize( img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) return img @@ -391,18 +386,15 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) rnum = np.random.rand() if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: # add noise L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img @@ -412,32 +404,28 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25): img = np.clip(img, 0.0, 1.0) rnum = random.random() if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_Poisson_noise(img): img = np.clip((img * 255.0).round(), 0, 255) / 255.0 - vals = 10**(2 * random.random() + 2.0) # [2, 4] + vals = 10 ** (2 * random.random() + 2.0) # [2, 4] if 
random.random() < 0.5: img = np.random.poisson(img * vals).astype(np.float32) / vals else: img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0 - noise_gray = (np.random.poisson(img_gray * vals).astype(np.float32) / - vals - img_gray) + noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray img += noise_gray[:, :, np.newaxis] img = np.clip(img, 0.0, 1.0) return img @@ -446,8 +434,7 @@ def add_Poisson_noise(img): def add_JPEG_noise(img): quality_factor = random.randint(30, 95) img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode( - ".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) + result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) img = cv2.imdecode(encimg, 1) img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) return img @@ -457,11 +444,10 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64): h, w = lq.shape[:2] rnd_h = random.randint(0, h - lq_patchsize) rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] + lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :] rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize - * sf, :] + hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :] return lq, hq @@ -482,7 +468,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): sf_ori = sf h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: @@ -495,7 +481,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: img = util.imresize_np(img, 1 / 2, True) img = np.clip(img, 0.0, 1.0) @@ -506,7 +493,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -524,15 +512,13 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - img = ndimage.filters.convolve( - img, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror") img = img[0::sf, 0::sf, ...] 
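            # Condensed, the shuffled pipeline that degradation_bsrgan implements
            # amounts to the sketch below (helper names are the ones defined in this
            # file; the real function randomizes the stage order and keeps the final
            # downsample and JPEG steps last):
            #
            #     def degrade_sketch(img, sf=4):
            #         img = modcrop_np(img, sf)             # make dims divisible by sf
            #         hq = img.copy()
            #         img = add_blur(img, sf=sf)            # Gaussian / anisotropic blur
            #         img = img[0::sf, 0::sf, ...]          # nearest downsample to 1/sf
            #         img = add_Gaussian_noise(img, 2, 25)  # color / gray / correlated noise
            #         img = add_JPEG_noise(img)             # JPEG compression artifacts
            #         return random_crop(img, hq, sf=sf)    # aligned LQ / HQ patches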
# nearest downsampling img = np.clip(img, 0.0, 1.0) @@ -541,7 +527,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) elif i == 4: @@ -585,7 +572,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): _, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 h1, w1 = image.shape[:2] - image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop h, w = image.shape[:2] if sf == 4 and random.random() < scale2_prob: # downsample1 @@ -593,7 +580,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: image = util.imresize_np(image, 1 / 2, True) image = np.clip(image, 0.0, 1.0) @@ -604,7 +592,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -621,17 +610,14 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): sf1 = random.uniform(1, 2 * sf) image = cv2.resize( image, - (int(1 / sf1 * image.shape[1]), - int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - image = ndimage.filters.convolve( - image, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror") image = image[0::sf, 0::sf, ...] 
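            # This variant (no LQ/HQ patch crop, returns a dict keyed by "image") is the
            # one the VAE example consumes: image_degradation/__init__.py re-exports it
            # as degradation_fn_bsr (and the bsrgan_light version as
            # degradation_fn_bsr_light), and ImageNetSR in autoencoder_datasets.py wraps
            # it via partial(degradation_fn_bsr, sf=downscale_f), storing the output
            # under example["LR_image"].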
# nearest downsampling image = np.clip(image, 0.0, 1.0) @@ -640,7 +626,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) image = np.clip(image, 0.0, 1.0) elif i == 4: @@ -673,19 +660,21 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): img_lq = deg_fn(img)["image"] img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize( - max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"] + img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[ + "image" + ] print(img_lq.shape) print("bicubic", img_lq_bicubic.shape) print(img_hq.shape) lq_nearest = cv2.resize( util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) + interpolation=0, + ) lq_bicubic_nearest = cv2.resize( util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) - img_concat = np.concatenate( - [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) + interpolation=0, + ) + img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) util.imsave(img_concat, str(i) + ".png") diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py index 86127e21d672e..94a515d93d914 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py @@ -29,6 +29,7 @@ from scipy.linalg import orth from . import utils_image as util + """ # -------------------------------------------- # Super-Resolution @@ -51,7 +52,7 @@ def modcrop_np(img, sf): """ w, h = img.shape[:2] im = np.copy(img) - return im[:w - w % sf, :h - h % sf, ...] + return im[: w - w % sf, : h - h % sf, ...] 
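    # For reference, modcrop_np just trims each spatial dimension down to a
    # multiple of the scale factor; with an assumed example shape:
    #
    #     x = np.zeros((511, 767, 3), dtype=np.float32)
    #     modcrop_np(x, sf=4).shape   # -> (508, 764, 3)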
""" @@ -69,7 +70,7 @@ def analytic_kernel(k): # Loop over the small kernel to fill the big one for r in range(k_size): for c in range(k_size): - big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k + big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k # Crop the edges of the big kernel to ignore very small values and increase run time of SR crop = k_size // 2 cropped_big_k = big_k[crop:-crop, crop:-crop] @@ -90,9 +91,9 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): """ v = np.dot( - np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), - np.array([1.0, 0.0]), ) + np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), + np.array([1.0, 0.0]), + ) V = np.array([[v[0], v[1]], [v[1], -v[0]]]) D = np.array([[l1, 0], [0, l2]]) Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) @@ -161,11 +162,12 @@ def blur(x, k): def gen_kernel( - k_size=np.array([15, 15]), - scale_factor=np.array([4, 4]), - min_var=0.6, - max_var=10.0, - noise_level=0, ): + k_size=np.array([15, 15]), + scale_factor=np.array([4, 4]), + min_var=0.6, + max_var=10.0, + noise_level=0, +): """ " # modified version of https://github.com/assafshocher/BlindSR_dataset_generator # Kai Zhang @@ -180,14 +182,12 @@ def gen_kernel( # Set COV matrix using Lambdas and Theta LAMBDA = np.diag([lambda_1, lambda_2]) - Q = np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @LAMBDA @Q.T + Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) + SIGMA = Q @ LAMBDA @ Q.T INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1 - ) # - 0.5 * (scale_factor - k_size % 2) + MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) MU = MU[None, None, :, None] # Create meshgrid for Gaussian @@ -197,7 +197,7 @@ def gen_kernel( # Calcualte Gaussian for every pixel of the kernel ZZ = Z - MU ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @INV_SIGMA @ZZ)) * (1 + noise) + raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) # shift the kernel so it will be centered # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) @@ -212,8 +212,7 @@ def fspecial_gaussian(hsize, sigma): hsize = [hsize, hsize] siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] std = sigma - [x, y] = np.meshgrid( - np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) + [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) arg = -(x * x + y * y) / (2 * std * std) h = np.exp(arg) h[h < scipy.finfo(float).eps * h.max()] = 0 @@ -279,9 +278,7 @@ def srmd_degradation(x, k, sf=3): year={2018} } """ - x = ndimage.filters.convolve( - x, np.expand_dims( - k, axis=2), mode="wrap") # 'nearest' | 'mirror' + x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror' x = bicubic_degradation(x, sf=sf) return x @@ -359,16 +356,10 @@ def add_blur(img, sf=4): if random.random() < 0.5: l1 = wd2 * random.random() l2 = wd2 * random.random() - k = anisotropic_Gaussian( - ksize=random.randint(2, 11) + 3, - theta=random.random() * np.pi, - l1=l1, - l2=l2) + k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) else: k = fspecial("gaussian", random.randint(2, 4) + 3, wd * random.random()) - img = ndimage.filters.convolve( - img, np.expand_dims( - k, 
axis=2), mode="mirror") + img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror") return img @@ -384,7 +375,8 @@ def add_resize(img, sf=4): img = cv2.resize( img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) return img @@ -394,18 +386,15 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) rnum = np.random.rand() if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: # add noise L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img @@ -415,32 +404,28 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25): img = np.clip(img, 0.0, 1.0) rnum = random.random() if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_Poisson_noise(img): img = np.clip((img * 255.0).round(), 0, 255) / 255.0 - vals = 10**(2 * random.random() + 2.0) # [2, 4] + vals = 10 ** (2 * random.random() + 2.0) # [2, 4] if random.random() < 0.5: img = np.random.poisson(img * vals).astype(np.float32) / vals else: img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0 - noise_gray = (np.random.poisson(img_gray * vals).astype(np.float32) / - vals - img_gray) + noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray img += noise_gray[:, :, np.newaxis] img = np.clip(img, 0.0, 1.0) return img @@ -449,8 +434,7 @@ def add_Poisson_noise(img): def add_JPEG_noise(img): quality_factor = random.randint(80, 95) img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode( - ".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) + result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) img = cv2.imdecode(encimg, 1) img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) return img @@ -460,11 +444,10 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64): h, w = lq.shape[:2] rnd_h = 
random.randint(0, h - lq_patchsize) rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] + lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :] rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize - * sf, :] + hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :] return lq, hq @@ -485,7 +468,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): sf_ori = sf h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: @@ -498,7 +481,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: img = util.imresize_np(img, 1 / 2, True) img = np.clip(img, 0.0, 1.0) @@ -509,7 +493,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -527,15 +512,13 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - img = ndimage.filters.convolve( - img, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror") img = img[0::sf, 0::sf, ...] # nearest downsampling img = np.clip(img, 0.0, 1.0) @@ -544,7 +527,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) elif i == 4: @@ -588,7 +572,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): _, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 h1, w1 = image.shape[:2] - image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] 
# mod crop h, w = image.shape[:2] if sf == 4 and random.random() < scale2_prob: # downsample1 @@ -596,7 +580,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: image = util.imresize_np(image, 1 / 2, True) image = np.clip(image, 0.0, 1.0) @@ -607,7 +592,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -624,17 +610,14 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): sf1 = random.uniform(1, 2 * sf) image = cv2.resize( image, - (int(1 / sf1 * image.shape[1]), - int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - image = ndimage.filters.convolve( - image, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror") image = image[0::sf, 0::sf, ...] # nearest downsampling image = np.clip(image, 0.0, 1.0) @@ -644,7 +627,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) image = np.clip(image, 0.0, 1.0) elif i == 4: @@ -677,19 +661,21 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): img_lq = deg_fn(img)["image"] img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize( - max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"] + img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[ + "image" + ] print(img_lq.shape) print("bicubic", img_lq_bicubic.shape) print(img_hq.shape) lq_nearest = cv2.resize( util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) + interpolation=0, + ) lq_bicubic_nearest = cv2.resize( util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) - img_concat = np.concatenate( - [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) + interpolation=0, + ) + img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) util.imsave(img_concat, str(i) + ".png") diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py index 1e21fe66a10b6..be3bdaa3321cc 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py @@ -71,14 +71,12 @@ def cubic(x): absx = paddle.abs(x) absx2 = absx**2 absx3 = absx**3 - return (1.5 * absx3 - 2.5 * absx2 + 1) * ( - (absx <= 1).astype(absx.dtype)) + ( - -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2) * (( - (absx > 1) * (absx <= 
2)).astype(absx.dtype)) + return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).astype(absx.dtype)) + ( + -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2 + ) * (((absx > 1) * (absx <= 2)).astype(absx.dtype)) -def calculate_weights_indices(in_length, out_length, scale, kernel, - kernel_width, antialiasing): +def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing): if (scale < 1) and (antialiasing): # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width kernel_width = kernel_width / scale @@ -102,14 +100,13 @@ def calculate_weights_indices(in_length, out_length, scale, kernel, # The indices of the input pixels involved in computing the k-th output # pixel are in row k of the indices matrix. - indices = left.reshape([out_length, 1]).expand( - [out_length, P]) + paddle.linspace(0, P - 1, P).reshape([1, P]).expand( - [out_length, P]) + indices = left.reshape([out_length, 1]).expand([out_length, P]) + paddle.linspace(0, P - 1, P).reshape( + [1, P] + ).expand([out_length, P]) # The weights used to compute the k-th output pixel are in row k of the # weights matrix. - distance_to_center = u.reshape([out_length, 1]).expand( - [out_length, P]) - indices + distance_to_center = u.reshape([out_length, 1]).expand([out_length, P]) - indices # apply cubic kernel if (scale < 1) and (antialiasing): weights = scale * cubic(distance_to_center * scale) @@ -158,13 +155,15 @@ def imresize_np(img, scale, antialiasing=True): # get weights and indices weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( - in_H, out_H, scale, kernel, kernel_width, antialiasing) + in_H, out_H, scale, kernel, kernel_width, antialiasing + ) weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( - in_W, out_W, scale, kernel, kernel_width, antialiasing) + in_W, out_W, scale, kernel, kernel_width, antialiasing + ) # process H dimension # symmetric copying img_aug = paddle.zeros([in_H + sym_len_Hs + sym_len_He, in_W, in_C]) - img_aug[sym_len_Hs:sym_len_Hs + in_H] = img + img_aug[sym_len_Hs : sym_len_Hs + in_H] = img sym_patch = img[:sym_len_Hs, :, :] inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64") @@ -174,20 +173,19 @@ def imresize_np(img, scale, antialiasing=True): sym_patch = img[-sym_len_He:, :, :] inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64") sym_patch_inv = sym_patch.index_select(inv_idx, axis=0) - img_aug[sym_len_Hs + in_H:sym_len_Hs + in_H + sym_len_He] = sym_patch_inv + img_aug[sym_len_Hs + in_H : sym_len_Hs + in_H + sym_len_He] = sym_patch_inv out_1 = paddle.zeros([out_H, in_W, in_C]) kernel_width = weights_H.shape[1] for i in range(out_H): idx = int(indices_H[i][0]) for j in range(out_C): - out_1[i, :, j] = (img_aug[idx:idx + kernel_width, :, j] - .transpose([1, 0]).mv(weights_H[i])) + out_1[i, :, j] = img_aug[idx : idx + kernel_width, :, j].transpose([1, 0]).mv(weights_H[i]) # process W dimension # symmetric copying out_1_aug = paddle.zeros([out_H, in_W + sym_len_Ws + sym_len_We, in_C]) - out_1_aug[:, sym_len_Ws:sym_len_Ws + in_W] = out_1 + out_1_aug[:, sym_len_Ws : sym_len_Ws + in_W] = out_1 sym_patch = out_1[:, :sym_len_Ws, :] inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64") @@ -197,16 +195,14 @@ def imresize_np(img, scale, antialiasing=True): sym_patch = out_1[:, -sym_len_We:, :] inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64") sym_patch_inv = sym_patch.index_select(inv_idx, axis=1) - out_1_aug[:, sym_len_Ws + in_W:sym_len_Ws 
+ in_W + - sym_len_We] = sym_patch_inv + out_1_aug[:, sym_len_Ws + in_W : sym_len_Ws + in_W + sym_len_We] = sym_patch_inv out_2 = paddle.zeros([out_H, out_W, in_C]) kernel_width = weights_W.shape[1] for i in range(out_W): idx = int(indices_W[i][0]) for j in range(out_C): - out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv( - weights_W[i]) + out_2[:, i, j] = out_1_aug[:, idx : idx + kernel_width, j].mv(weights_W[i]) if need_squeeze: out_2 = out_2.squeeze() diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py index 3d8311776fdb3..a1d4f642125ae 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py @@ -22,47 +22,36 @@ import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url -from ppdiffusers.initializer import (constant_, normal_, - reset_initialized_parameter) +from ppdiffusers.initializer import constant_, normal_, reset_initialized_parameter model_urls = { "vgg16": ( "https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/lpips_vgg16.pdparams", - "a1583475db9e49334735f2866847ae41", ), + "a1583475db9e49334735f2866847ae41", + ), "vgg_netlin": ( "https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/vgg_netlin.pdparams", - "f3ae85f16a1a243e789606ae0c4a59a1", ), + "f3ae85f16a1a243e789606ae0c4a59a1", + ), } class ActNorm(nn.Layer): - def __init__(self, - num_features, - logdet=False, - affine=True, - allow_reverse_init=False): + def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False): assert affine super().__init__() self.logdet = logdet - self.loc = self.create_parameter( - (1, num_features, 1, 1), - default_initializer=nn.initializer.Constant(0)) - self.scale = self.create_parameter( - (1, num_features, 1, 1), - default_initializer=nn.initializer.Constant(1)) + self.loc = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(0)) + self.scale = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(1)) self.allow_reverse_init = allow_reverse_init - self.register_buffer( - "initialized", paddle.to_tensor( - 0, dtype=paddle.int64)) + self.register_buffer("initialized", paddle.to_tensor(0, dtype=paddle.int64)) @paddle.no_grad() def initialize(self, input): flatten = input.transpose([1, 0, 2, 3]).reshape([input.shape[1], -1]) - mean = (flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3) - .transpose([1, 0, 2, 3])) - std = (flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3) - .transpose([1, 0, 2, 3])) + mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3]) + std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3]) self.loc.set_value(-mean) self.scale.set_value(1 / (std + 1e-6)) @@ -80,9 +69,7 @@ def forward(self, input, reverse=False): if self.training and self.initialized.item() == 0: self.initialize(input) - self.initialized.set_value( - paddle.to_tensor( - 1, dtype=self.initialized.dtype)) + self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype)) h = self.scale * (input + self.loc) @@ -106,9 +93,7 @@ def reverse(self, output): ) else: self.initialize(output) - self.initialized.set_value( - paddle.to_tensor( - 1, dtype=self.initialized.dtype)) + self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype)) if len(output.shape) == 2: output = output[:, :, None, None] @@ -137,8 +122,7 @@ def hinge_d_loss(logits_real, logits_fake): 
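# For context, the two discriminator losses here follow the standard GAN
# formulations (hinge_d_loss's body is not part of this hunk, so the hinge line
# below is a sketch of the usual form rather than a quote of the source):
#
#     hinge:   d_loss = 0.5 * (mean(relu(1 - logits_real)) + mean(relu(1 + logits_fake)))
#     vanilla: d_loss = 0.5 * (mean(softplus(-logits_real)) + mean(softplus(logits_fake)))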
def vanilla_d_loss(logits_real, logits_fake): - d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) + - paddle.mean(F.softplus(logits_fake))) + d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) + paddle.mean(F.softplus(logits_fake))) return d_loss @@ -170,8 +154,7 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): norm_layer = nn.BatchNorm2D else: norm_layer = ActNorm - if (type(norm_layer) == functools. - partial): # no need to use bias as BatchNorm2d has affine parameters + if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters use_bias = norm_layer.func != nn.BatchNorm2D else: use_bias = norm_layer != nn.BatchNorm2D @@ -179,8 +162,7 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): kw = 4 padw = 1 sequence = [ - nn.Conv2D( - input_nc, ndf, kernel_size=kw, stride=2, padding=padw), + nn.Conv2D(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2), ] nf_mult = 1 @@ -195,7 +177,8 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): kernel_size=kw, stride=2, padding=padw, - bias_attr=use_bias, ), + bias_attr=use_bias, + ), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2), ] @@ -209,14 +192,14 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): kernel_size=kw, stride=1, padding=padw, - bias_attr=use_bias, ), + bias_attr=use_bias, + ), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2), ] sequence += [ - nn.Conv2D( - ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + nn.Conv2D(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) ] # output 1 channel prediction map self.main = nn.Sequential(*sequence) @@ -229,10 +212,8 @@ def spatial_average(in_tens, keepdim=True): return in_tens.mean([2, 3], keepdim=keepdim) -def upsample(in_tens, - out_HW=(64, 64)): # assumes scale factor is same for H and W - return nn.Upsample( - size=out_HW, mode="bilinear", align_corners=False)(in_tens) +def upsample(in_tens, out_HW=(64, 64)): # assumes scale factor is same for H and W + return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens) def normalize_tensor(in_feat, eps=1e-10): @@ -246,10 +227,15 @@ class NetLinLayer(nn.Layer): def __init__(self, chn_in, chn_out=1, use_dropout=False): super(NetLinLayer, self).__init__() - layers = ([nn.Dropout(), ] if (use_dropout) else []) + layers = ( + [ + nn.Dropout(), + ] + if (use_dropout) + else [] + ) layers += [ - nn.Conv2D( - chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False), + nn.Conv2D(chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False), ] self.model = nn.Sequential(*layers) @@ -262,14 +248,12 @@ def __init__(self): super(ScalingLayer, self).__init__() self.register_buffer( "shift", - paddle.to_tensor( - np.asarray([-0.030, -0.088, -0.188]).astype("float32")[ - None, :, None, None]), ) + paddle.to_tensor(np.asarray([-0.030, -0.088, -0.188]).astype("float32")[None, :, None, None]), + ) self.register_buffer( "scale", - paddle.to_tensor( - np.asarray([0.458, 0.448, 0.450]).astype("float32")[ - None, :, None, None]), ) + paddle.to_tensor(np.asarray([0.458, 0.448, 0.450]).astype("float32")[None, :, None, None]), + ) def forward(self, inp): return (inp - self.shift) / self.scale @@ -280,8 +264,7 @@ def __init__(self, pretrained=True, requires_grad=False): super(VGG16, self).__init__() vgg_model = paddle.vision.models.vgg16(pretrained=False) if pretrained: - state_dict = paddle.load( - get_weights_path_from_url(*model_urls["vgg16"])) + state_dict = 
paddle.load(get_weights_path_from_url(*model_urls["vgg16"])) vgg_model.set_state_dict(state_dict) vgg_pretrained_features = vgg_model.features self.slice1 = nn.Sequential() @@ -315,9 +298,7 @@ def forward(self, X): h_relu4_3 = h h = self.slice5(h) h_relu5_3 = h - vgg_outputs = namedtuple( - "VggOutputs", - ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) + vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) return out @@ -325,25 +306,27 @@ def forward(self, X): class LPIPS(nn.Layer): def __init__( - self, - pretrained=True, - net="alex", - lpips=True, - spatial=False, - pnet_rand=False, - pnet_tune=False, - use_dropout=True, - model_path=None, - eval_mode=True, - verbose=True, ): + self, + pretrained=True, + net="alex", + lpips=True, + spatial=False, + pnet_rand=False, + pnet_tune=False, + use_dropout=True, + model_path=None, + eval_mode=True, + verbose=True, + ): # lpips - [True] means with linear calibration on top of base network # pretrained - [True] means load linear weights super(LPIPS, self).__init__() if verbose: - print("Setting up [%s] perceptual loss: trunk [%s], spatial [%s]" % - ("LPIPS" if lpips else "baseline", net, "on" - if spatial else "off")) + print( + "Setting up [%s] perceptual loss: trunk [%s], spatial [%s]" + % ("LPIPS" if lpips else "baseline", net, "on" if spatial else "off") + ) self.pnet_type = net.lower() self.pnet_tune = pnet_tune @@ -359,8 +342,7 @@ def __init__( raise NotImplementedError self.L = len(self.chns) - self.net = net_type( - pretrained=not self.pnet_rand, requires_grad=self.pnet_tune) + self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune) if lpips: lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) @@ -377,8 +359,7 @@ def __init__( if pretrained: if model_path is None: - model_path = get_weights_path_from_url(*model_urls[ - "vgg_netlin"]) + model_path = get_weights_path_from_url(*model_urls["vgg_netlin"]) if verbose: print("Loading model from: %s" % model_path) import warnings @@ -393,47 +374,29 @@ def __init__( param.stop_gradient = True def forward(self, in0, in1, retPerLayer=False, normalize=False): - if (normalize): # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1] + if normalize: # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1] in0 = 2 * in0 - 1 in1 = 2 * in1 - 1 # v0.0 - original release had a bug, where input was not scaled - in0_input, in1_input = (self.scaling_layer(in0), - self.scaling_layer(in1)) + in0_input, in1_input = (self.scaling_layer(in0), self.scaling_layer(in1)) outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input) feats0, feats1, diffs = {}, {}, {} for kk in range(self.L): - feats0[kk], feats1[kk] = normalize_tensor(outs0[ - kk]), normalize_tensor(outs1[kk]) - diffs[kk] = (feats0[kk] - feats1[kk])**2 + feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) + diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 if self.lpips: if self.spatial: - res = [ - upsample( - self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) - for kk in range(self.L) - ] + res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)] else: - res = [ - spatial_average( - self.lins[kk](diffs[kk]), keepdim=True) - for kk in range(self.L) - ] + res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)] else: if self.spatial: - res = [ - upsample( - diffs[kk].sum(axis=1, 
keepdim=True), - out_HW=in0.shape[2:]) for kk in range(self.L) - ] + res = [upsample(diffs[kk].sum(axis=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)] else: - res = [ - spatial_average( - diffs[kk].sum(axis=1, keepdim=True), keepdim=True) - for kk in range(self.L) - ] + res = [spatial_average(diffs[kk].sum(axis=1, keepdim=True), keepdim=True) for kk in range(self.L)] val = res[0] for l in range(1, self.L): @@ -447,19 +410,20 @@ def forward(self, in0, in1, retPerLayer=False, normalize=False): class LPIPSWithDiscriminator(nn.Layer): def __init__( - self, - disc_start, - logvar_init=0.0, - kl_weight=1.0, - pixelloss_weight=1.0, - disc_num_layers=3, - disc_in_channels=3, - disc_factor=1.0, - disc_weight=1.0, - perceptual_weight=1.0, - use_actnorm=False, - disc_conditional=False, - disc_loss="hinge", ): + self, + disc_start, + logvar_init=0.0, + kl_weight=1.0, + pixelloss_weight=1.0, + disc_num_layers=3, + disc_in_channels=3, + disc_factor=1.0, + disc_weight=1.0, + perceptual_weight=1.0, + use_actnorm=False, + disc_conditional=False, + disc_loss="hinge", + ): super().__init__() assert disc_loss in ["hinge", "vanilla"] @@ -471,15 +435,13 @@ def __init__( self.perceptual_weight = perceptual_weight self.discriminator = NLayerDiscriminator( - input_nc=disc_in_channels, - n_layers=disc_num_layers, - use_actnorm=use_actnorm) + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm + ) reset_initialized_parameter(self.discriminator) self.discriminator.apply(weights_init) # output log variance - self.logvar = self.create_parameter( - (1, ), default_initializer=nn.initializer.Constant(logvar_init)) + self.logvar = self.create_parameter((1,), default_initializer=nn.initializer.Constant(logvar_init)) self.discriminator_iter_start = disc_start self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss @@ -489,15 +451,11 @@ def __init__( def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): if last_layer is not None: - nll_grads = paddle.autograd.grad( - nll_loss, last_layer, retain_graph=True)[0] - g_grads = paddle.autograd.grad( - g_loss, last_layer, retain_graph=True)[0] + nll_grads = paddle.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] + g_grads = paddle.autograd.grad(g_loss, last_layer, retain_graph=True)[0] else: - nll_grads = paddle.autograd.grad( - nll_loss, self.last_layer[0], retain_graph=True)[0] - g_grads = paddle.autograd.grad( - g_loss, self.last_layer[0], retain_graph=True)[0] + nll_grads = paddle.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] + g_grads = paddle.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] d_weight = paddle.norm(nll_grads) / (paddle.norm(g_grads) + 1e-4) d_weight = paddle.clip(d_weight, 0.0, 1e4).detach() @@ -505,16 +463,17 @@ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): return d_weight def forward( - self, - inputs, - reconstructions, - posteriors, - optimizer_idx, - global_step, - last_layer=None, - cond=None, - split="train", - weights=None, ): + self, + inputs, + reconstructions, + posteriors, + optimizer_idx, + global_step, + last_layer=None, + cond=None, + split="train", + weights=None, + ): rec_loss = paddle.abs(inputs - reconstructions) if self.perceptual_weight > 0: p_loss = self.perceptual_loss(inputs, reconstructions) @@ -525,8 +484,7 @@ def forward( weighted_nll_loss = nll_loss if weights is not None: weighted_nll_loss = weights * nll_loss - weighted_nll_loss = paddle.sum( - weighted_nll_loss) / 
weighted_nll_loss.shape[0] + weighted_nll_loss = paddle.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] nll_loss = paddle.sum(nll_loss) / nll_loss.shape[0] kl_loss = posteriors.kl() kl_loss = paddle.sum(kl_loss) / kl_loss.shape[0] @@ -539,37 +497,28 @@ def forward( logits_fake = self.discriminator(reconstructions) else: assert self.disc_conditional - logits_fake = self.discriminator( - paddle.concat( - (reconstructions, cond), axis=1)) + logits_fake = self.discriminator(paddle.concat((reconstructions, cond), axis=1)) g_loss = -paddle.mean(logits_fake) if self.disc_factor > 0.0: try: - d_weight = self.calculate_adaptive_weight( - nll_loss, g_loss, last_layer=last_layer) + d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) except Exception: assert not self.training d_weight = paddle.to_tensor(0.0) else: d_weight = paddle.to_tensor(0.0) - disc_factor = adopt_weight( - self.disc_factor, - global_step, - threshold=self.discriminator_iter_start) - loss = (weighted_nll_loss + self.kl_weight * kl_loss + d_weight * - disc_factor * g_loss) + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss log = { - "{}/total_loss".format(split): - loss.clone().detach().mean().item(), + "{}/total_loss".format(split): loss.clone().detach().mean().item(), "{}/logvar".format(split): self.logvar.detach().item(), "{}/kl_loss".format(split): kl_loss.detach().mean().item(), "{}/nll_loss".format(split): nll_loss.detach().mean().item(), "{}/rec_loss".format(split): rec_loss.detach().mean().item(), "{}/d_weight".format(split): d_weight.detach().item(), - "{}/disc_factor".format(split): - paddle.to_tensor(disc_factor).item(), + "{}/disc_factor".format(split): paddle.to_tensor(disc_factor).item(), "{}/g_loss".format(split): g_loss.detach().mean().item(), } return loss, log @@ -580,24 +529,14 @@ def forward( logits_real = self.discriminator(inputs.detach()) logits_fake = self.discriminator(reconstructions.detach()) else: - logits_real = self.discriminator( - paddle.concat( - (inputs.detach(), cond), axis=1)) - logits_fake = self.discriminator( - paddle.concat( - (reconstructions.detach(), cond), axis=1)) - disc_factor = adopt_weight( - self.disc_factor, - global_step, - threshold=self.discriminator_iter_start) + logits_real = self.discriminator(paddle.concat((inputs.detach(), cond), axis=1)) + logits_fake = self.discriminator(paddle.concat((reconstructions.detach(), cond), axis=1)) + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) log = { - "{}/disc_loss".format(split): - d_loss.clone().detach().mean().item(), - "{}/logits_real".format(split): - logits_real.detach().mean().item(), - "{}/logits_fake".format(split): - logits_fake.detach().mean().item(), + "{}/disc_loss".format(split): d_loss.clone().detach().mean().item(), + "{}/logits_real".format(split): logits_real.detach().mean().item(), + "{}/logits_fake".format(split): logits_fake.detach().mean().item(), } return d_loss, log diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/model.py b/ppdiffusers/examples/autoencoder/vae/ldm/model.py index 81cd75c9787bc..5df1c98fe4c61 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/model.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/model.py @@ -22,8 +22,13 @@ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config from 
ppdiffusers.initializer import reset_initialized_parameter from ppdiffusers.models.autoencoder_kl import ( - AutoencoderKLOutput, Decoder, DecoderOutput, DiagonalGaussianDistribution, - Encoder) + AutoencoderKLOutput, + Decoder, + DecoderOutput, + DiagonalGaussianDistribution, + Encoder, +) + # from ppdiffusers.models.ema import LitEma from ppdiffusers.models.modeling_utils import ModelMixin @@ -33,8 +38,7 @@ def count_params(model, verbose=True): total_params = sum(p.numel() for p in model.parameters()).item() if verbose: - print( - f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") + print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") return total_params @@ -44,59 +48,62 @@ class AutoencoderKLWithLoss(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=3, - out_channels: int=3, - down_block_types: Tuple[str]=( - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", ), - down_block_out_channels: Tuple[int]=None, - up_block_types: Tuple[str]=( - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", ), - up_block_out_channels: Tuple[int]=None, - block_out_channels: Tuple[int]=(128, 256, 512, 512), - layers_per_block: int=2, - act_fn: str="silu", - latent_channels: int=4, - norm_num_groups: int=32, - sample_size: int=512, - # new add - input_size: Tuple[int]=None, - # loss arguments - disc_start=50001, - kl_weight=1.0e-6, - disc_weight=0.5, - logvar_init=0.0, - pixelloss_weight=1.0, - disc_num_layers=3, - disc_in_channels=3, - disc_factor=1.0, - perceptual_weight=1.0, - use_actnorm=False, - disc_conditional=False, - disc_loss="hinge", - use_ema=False, - ema_decay=None, ): + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ( + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + ), + down_block_out_channels: Tuple[int] = None, + up_block_types: Tuple[str] = ( + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + ), + up_block_out_channels: Tuple[int] = None, + block_out_channels: Tuple[int] = (128, 256, 512, 512), + layers_per_block: int = 2, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 512, + # new add + input_size: Tuple[int] = None, + # loss arguments + disc_start=50001, + kl_weight=1.0e-6, + disc_weight=0.5, + logvar_init=0.0, + pixelloss_weight=1.0, + disc_num_layers=3, + disc_in_channels=3, + disc_factor=1.0, + perceptual_weight=1.0, + use_actnorm=False, + disc_conditional=False, + disc_loss="hinge", + use_ema=False, + ema_decay=None, + ): super().__init__() - self.input_size = ([int(_) for _ in input_size] - if input_size is not None else None) + self.input_size = [int(_) for _ in input_size] if input_size is not None else None self.encoder = Encoder( in_channels=in_channels, out_channels=latent_channels, down_block_types=down_block_types, block_out_channels=down_block_out_channels - if down_block_out_channels is - not None # if down_block_out_channels not givien, we will use block_out_channels + if down_block_out_channels + is not None # if down_block_out_channels not givien, we will use block_out_channels else block_out_channels, layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - double_z=True, ) + double_z=True, + ) # pass init params to Decoder self.decoder = Decoder( @@ -104,10 +111,12 @@ def __init__( out_channels=out_channels, 
up_block_types=up_block_types, block_out_channels=up_block_out_channels # if up_block_out_channels not givien, we will use block_out_channels - if up_block_out_channels is not None else block_out_channels, + if up_block_out_channels is not None + else block_out_channels, layers_per_block=layers_per_block, norm_num_groups=norm_num_groups, - act_fn=act_fn, ) + act_fn=act_fn, + ) self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1) self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1) @@ -125,7 +134,8 @@ def __init__( perceptual_weight=perceptual_weight, use_actnorm=use_actnorm, disc_conditional=disc_conditional, - disc_loss=disc_loss, ) + disc_loss=disc_loss, + ) count_params(self) self.init_weights() self.use_ema = use_ema @@ -143,9 +153,10 @@ def init_weights(self): reset_initialized_parameter(self.post_quant_conv) def custom_forward( - self, - sample: paddle.Tensor, - sample_posterior: bool=True, ): + self, + sample: paddle.Tensor, + sample_posterior: bool = True, + ): posterior = self.encode(sample).latent_dist if sample_posterior: z = posterior.sample() @@ -183,8 +194,7 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0): if self.input_size is None: encoder_inputs = pixel_values else: - encoder_inputs = F.interpolate( - pixel_values, size=self.input_size, mode="bilinear") + encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") reconstructions, posterior = self.custom_forward(encoder_inputs) @@ -197,7 +207,8 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0): optimizer_idx, global_step, last_layer=self.get_last_layer(), - split="train", ) + split="train", + ) return aeloss, log_dict_ae if optimizer_idx == 1: @@ -209,7 +220,8 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0): optimizer_idx, global_step, last_layer=self.get_last_layer(), - split="train", ) + split="train", + ) return discloss, log_dict_disc @paddle.no_grad() @@ -219,21 +231,18 @@ def log_images(self, pixel_values, only_inputs=False, **kwargs): if self.input_size is None: encoder_inputs = pixel_values else: - encoder_inputs = F.interpolate( - pixel_values, size=self.input_size, mode="bilinear") + encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") if not only_inputs: xrec, posterior = self.custom_forward(encoder_inputs) - log["samples"] = self.decode_image( - self.decode(paddle.randn(posterior.sample().shape)).sample) + log["samples"] = self.decode_image(self.decode(paddle.randn(posterior.sample().shape)).sample) log["reconstructions"] = self.decode_image(xrec) if self.use_ema: with self.ema_scope(): - xrec_ema, posterior_ema = self.custom_forward( - encoder_inputs) + xrec_ema, posterior_ema = self.custom_forward(encoder_inputs) log["samples_ema"] = self.decode_image( - self.decode( - paddle.randn(posterior_ema.sample().shape)).sample) + self.decode(paddle.randn(posterior_ema.sample().shape)).sample + ) log["reconstructions_ema"] = self.decode_image(xrec_ema) # update log["encoder_inputs"] = self.decode_image(encoder_inputs) @@ -247,12 +256,10 @@ def decode_image(self, image): @paddle.no_grad() def validation_step(self, pixel_values, global_step=0): - log_dict_ae, log_dict_disc = self._validation_step(pixel_values, - global_step) + log_dict_ae, log_dict_disc = self._validation_step(pixel_values, global_step) if self.use_ema: with self.ema_scope(): - log_dict_ae_ema, log_dict_disc_ema = self._validation_step( - pixel_values, global_step, postfix="_ema") + log_dict_ae_ema, 
log_dict_disc_ema = self._validation_step(pixel_values, global_step, postfix="_ema") log_dict_ae.update(log_dict_ae_ema) log_dict_disc.update(log_dict_disc_ema) @@ -263,8 +270,7 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""): if self.input_size is None: encoder_inputs = pixel_values else: - encoder_inputs = F.interpolate( - pixel_values, size=self.input_size, mode="bilinear") + encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") reconstructions, posterior = self.custom_forward(encoder_inputs) aeloss, log_dict_ae = self.loss( @@ -274,7 +280,8 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""): 0, global_step, last_layer=self.get_last_layer(), - split="val" + postfix, ) + split="val" + postfix, + ) discloss, log_dict_disc = self.loss( pixel_values, @@ -283,7 +290,8 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""): 1, global_step, last_layer=self.get_last_layer(), - split="val" + postfix, ) + split="val" + postfix, + ) self.train() return log_dict_ae, log_dict_disc @@ -333,26 +341,25 @@ def untoggle_optimizer(self, optimizers, optimizer_idx): if optimizer_idx != opt_idx: for param in opt._parameter_list: if param in self._param_stop_gradient_state: - param.stop_gradient = self._param_stop_gradient_state[ - param] + param.stop_gradient = self._param_stop_gradient_state[param] # save memory self._param_stop_gradient_state = {} - def encode(self, x: paddle.Tensor, return_dict: bool=True): + def encode(self, x: paddle.Tensor, return_dict: bool = True): h = self.encoder(x) moments = self.quant_conv(h) posterior = DiagonalGaussianDistribution(moments) if not return_dict: - return (posterior, ) + return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - def decode(self, z: paddle.Tensor, return_dict: bool=True): + def decode(self, z: paddle.Tensor, return_dict: bool = True): z = self.post_quant_conv(z) dec = self.decoder(z) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py index 8d3f4a8f4ac7a..4a91b34df3acc 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py @@ -77,22 +77,25 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + ): self.size = size if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), interpolation), + RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.file_list = [] @@ -115,19 +118,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = 
np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -136,9 +134,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: retry = 0 while True: line = f.readline() @@ -167,12 +163,9 @@ def sample_loader(self, file_ids, filenames): yield data def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -181,8 +174,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug @@ -211,8 +203,7 @@ def __iter__(self): return self.shuffle(iter(self.random_load_from_multi_dataset())) -def split_data_per_worker(dataset, worker_id, local_rank, world_size, - num_workers): +def split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers): worker_global_id = local_rank * num_workers + worker_id dataset.rng = np.random.RandomState(worker_global_id) for i in range(len(dataset.file_ids)): @@ -238,8 +229,7 @@ def worker_init_fn(_): world_size = dist.get_world_size() num_workers = worker_info.num_workers if isinstance(dataset, TextImagePair): - split_data_per_worker(dataset, worker_id, local_rank, world_size, - num_workers) + split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers) return np.random.seed(np.random.get_state()[1][0] + worker_id) else: return np.random.seed(np.random.get_state()[1][0] + worker_id) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py index 08141d43c821e..ebfb3ff1df677 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py @@ -40,9 +40,7 @@ def reorder_image(img, input_order="HWC"): """ if input_order not in ["HWC", "CHW"]: - raise ValueError( - f"Wrong input_order {input_order}. Supported input_orders are " - "'HWC' and 'CHW'") + raise ValueError(f"Wrong input_order {input_order}. 
Supported input_orders are " "'HWC' and 'CHW'") if len(img.shape) == 2: img = img[..., None] if input_order == "CHW": @@ -68,12 +66,9 @@ def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs): float: psnr result. """ - assert (img.shape == img2.shape - ), f"Image shapes are different: {img.shape}, {img2.shape}." + assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}." if input_order not in ["HWC", "CHW"]: - raise ValueError( - f"Wrong input_order {input_order}. Supported input_orders are " - '"HWC" and "CHW"') + raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " '"HWC" and "CHW"') img = reorder_image(img, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) img = img.astype(np.float64) @@ -83,7 +78,7 @@ def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs): img = img[crop_border:-crop_border, crop_border:-crop_border, ...] img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - mse = np.mean((img - img2)**2) + mse = np.mean((img - img2) ** 2) if mse == 0: return float("inf") return 20.0 * np.log10(255.0 / np.sqrt(mse)) @@ -102,8 +97,8 @@ def _ssim(img, img2): float: ssim result. """ - c1 = (0.01 * 255)**2 - c2 = (0.03 * 255)**2 + c1 = (0.01 * 255) ** 2 + c2 = (0.03 * 255) ** 2 img = img.astype(np.float64) img2 = img2.astype(np.float64) @@ -119,8 +114,7 @@ def _ssim(img, img2): sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 - ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ( - (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) + ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) return ssim_map.mean() @@ -149,12 +143,9 @@ def calculate_ssim(img, img2, crop_border, input_order="HWC", **kwargs): float: ssim result. """ - assert (img.shape == img2.shape - ), f"Image shapes are different: {img.shape}, {img2.shape}." + assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}." if input_order not in ["HWC", "CHW"]: - raise ValueError( - f"Wrong input_order {input_order}. Supported input_orders are " - '"HWC" and "CHW"') + raise ValueError(f"Wrong input_order {input_order}. 
Supported input_orders are " '"HWC" and "CHW"') img = reorder_image(img, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) img = img.astype(np.float64) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py index d466ef6155819..d239d53cf5fcf 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py @@ -53,8 +53,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -70,8 +69,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -113,8 +111,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -122,21 +119,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. 
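# --- Illustrative sketch, not part of the diff: the fused-qkv split performed by the
# code just below. Shapes and the num_head_channels value are hypothetical; only the
# leading dimension (3 * channels) matters for the reshape-and-split.
import numpy as np

channels, num_head_channels = 512, 64
num_heads = (3 * channels) // num_head_channels // 3        # mirrors the expression used below
fused = np.random.randn(3 * channels, channels)             # a fused qkv projection weight
per_head = fused.reshape((num_heads, 3 * channels // num_heads) + fused.shape[1:])
query, key, value = np.split(per_head, 3, axis=1)           # three (num_heads, C // num_heads, C) chunks
query = query.reshape(-1, channels)                         # back to a (channels, channels) weight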
if attention_paths_to_split is not None: @@ -144,13 +140,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -161,8 +155,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -172,8 +165,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -202,14 +194,10 @@ def create_vae_diffusers_config(original_config): decoder_vae_params = original_config.model.params.ddconfig.decoder vae_params = decoder_vae_params - encoder_block_out_channels = [ - encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult - ] + encoder_block_out_channels = [encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult] down_block_types = ["DownEncoderBlock2D"] * len(encoder_block_out_channels) - decoder_block_out_channels = [ - decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult - ] + decoder_block_out_channels = [decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult] up_block_types = ["UpDecoderBlock2D"] * len(decoder_block_out_channels) config = dict( @@ -222,114 +210,82 @@ def create_vae_diffusers_config(original_config): down_block_out_channels=tuple(encoder_block_out_channels), up_block_out_channels=tuple(decoder_block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config def convert_ldm_vae_checkpoint(vae_state_dict, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - 
new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = 
renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -337,58 +293,50 @@ def convert_ldm_vae_checkpoint(vae_state_dict, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] 
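# --- Illustrative sketch, not part of the diff: how assign_to_checkpoint() applies the
# meta_path replacement built above to the decoder mid-block attention keys collected in
# mid_attentions. The key names here are examples only; the per-key renames from
# renew_vae_attention_paths (e.g. proj_out -> proj_attn) happen before this step.
replacements = [{"old": "mid.attn_1", "new": "mid_block.attentions.0"}]
for old_key in ("decoder.mid.attn_1.proj_attn.weight", "decoder.mid.attn_1.proj_attn.bias"):
    new_key = old_key
    for rep in replacements:
        new_key = new_key.replace(rep["old"], rep["new"])   # same str.replace as in assign_to_checkpoint
    print(old_key, "->", new_key)
# decoder.mid.attn_1.proj_attn.weight -> decoder.mid_block.attentions.0.proj_attn.weight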
paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -396,14 +344,13 @@ def convert_ldm_vae_checkpoint(vae_state_dict, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -442,7 +389,8 @@ def check_keys(model, state_dict): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default="../config/f8encoder_f16decoder.yaml", @@ -453,13 +401,15 @@ def check_keys(model, state_dict): "--dtype", default="float32", type=str, - help="Dtype of model weights.", ) + help="Dtype of model weights.", + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() @@ -469,11 +419,9 @@ def check_keys(model, state_dict): vae_config = create_vae_diffusers_config(original_config) # 1. convert vae encoder and decoder - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint, args.dtype) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint, args.dtype) # 2. convert losses maps = { @@ -491,7 +439,7 @@ def check_keys(model, state_dict): k = k.replace(old, new) # paddle donot support 0d tensor if v.ndim == 0: - v = v.reshape((1, )) + v = v.reshape((1,)) # rename if "perceptual_loss.lin" in k: k = k.replace("perceptual_loss.lin", "perceptual_loss.lins.") @@ -501,5 +449,4 @@ def check_keys(model, state_dict): check_keys(vae, ppdiffusers_vae_checkpoint) vae.save_config(args.dump_path) # 4. save state_dict - paddle.save(ppdiffusers_vae_checkpoint, - os.path.join(args.dump_path, "model_state.pdparams")) + paddle.save(ppdiffusers_vae_checkpoint, os.path.join(args.dump_path, "model_state.pdparams")) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py index 6bc24b3d88bab..0e7e08a580299 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py @@ -67,35 +67,28 @@ def tqdm(x): from inception import InceptionV3 parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--batch-size", type=int, default=50, help="Batch size to use") +parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use") parser.add_argument( "--num-workers", type=int, - help=("Number of processes to use for data loading. " - "Defaults to `min(8, num_cpus)`"), ) -parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use. Like gpu, gpu:0 or cpu") + help=("Number of processes to use for data loading. 
" "Defaults to `min(8, num_cpus)`"), +) +parser.add_argument("--device", type=str, default=None, help="Device to use. Like gpu, gpu:0 or cpu") parser.add_argument( "--dims", type=int, default=2048, choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), - help=("Dimensionality of Inception features to use. " - "By default, uses pool3 features"), ) + help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"), +) parser.add_argument( "path", type=str, nargs=2, - help=("Paths to the generated images or " - "to .npz statistic files"), ) + help=("Paths to the generated images or " "to .npz statistic files"), +) -IMAGE_EXTENSIONS = { - "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp" -} +IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} class ImagePathDataset(paddle.io.Dataset): @@ -136,8 +129,7 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): model.eval() if batch_size > len(files): - print(("Warning: batch size is bigger than the data size. " - "Setting batch size to data size")) + print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size")) batch_size = len(files) dataset = ImagePathDataset(files, transforms=TF.ToTensor()) @@ -146,7 +138,8 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): batch_size=batch_size, shuffle=False, drop_last=False, - num_workers=num_workers, ) + num_workers=num_workers, + ) pred_arr = np.empty((len(files), dims)) @@ -165,7 +158,7 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): pred = pred.squeeze(3).squeeze(2).cpu().numpy() - pred_arr[start_idx:start_idx + pred.shape[0]] = pred + pred_arr[start_idx : start_idx + pred.shape[0]] = pred start_idx = start_idx + pred.shape[0] @@ -200,18 +193,15 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): sigma1 = np.atleast_2d(sigma1) sigma2 = np.atleast_2d(sigma2) - assert (mu1.shape == mu2.shape - ), "Training and test mean vectors have different lengths" - assert (sigma1.shape == sigma2.shape - ), "Training and test covariances have different dimensions" + assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" + assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" diff = mu1 - mu2 # Product might be almost singular covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) if not np.isfinite(covmean).all(): - msg = ("fid calculation produces singular product; " - "adding %s to diagonal of cov estimates") % eps + msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps print(msg) offset = np.eye(sigma1.shape[0]) * eps covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) @@ -228,11 +218,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean -def calculate_activation_statistics(files, - model, - batch_size=50, - dims=2048, - num_workers=1): +def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1): """Calculation of the statistics used by the FID. 
Params: -- files : List of image files paths @@ -261,13 +247,8 @@ def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1): m, s = f["mu"][:], f["sigma"][:] else: path = pathlib.Path(path) - files = sorted([ - file - for ext in IMAGE_EXTENSIONS - for file in path.glob("*.{}".format(ext)) - ]) - m, s = calculate_activation_statistics(files, model, batch_size, dims, - num_workers) + files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))]) + m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers) return m, s @@ -282,10 +263,8 @@ def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1): model = InceptionV3([block_idx]) - m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, - num_workers) - m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, - num_workers) + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers) + m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers) fid_value = calculate_frechet_distance(m1, s1, m2, s2) return fid_value @@ -302,8 +281,7 @@ def main(): else: num_workers = args.num_workers - fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, - num_workers) + fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, num_workers) print("FID: ", fid_value) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py index 3eb58b8b7de40..7e5eadaf365b2 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py @@ -25,15 +25,16 @@ from ppdiffusers import AutoencoderKL, StableDiffusionImg2ImgPipeline -image_processing = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), -]) +image_processing = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] +) def decode_image(image): - image = (image / 2 + 0.5).clip(0, 1).transpose( - [0, 2, 3, 1]).cast("float32").numpy() + image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]).cast("float32").numpy() image = StableDiffusionImg2ImgPipeline.numpy_to_pil(image) return image @@ -62,8 +63,7 @@ def main(vae_path, src_size, tgt_size, imgs, outdir): z = model.encode(img).latent_dist.sample() recon = model.decode(z).sample - decode_image(recon)[0].save( - osp.join(outdir, osp.basename(img_path))) + decode_image(recon)[0].save(osp.join(outdir, osp.basename(img_path))) if __name__ == "__main__": diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py index 9aecdf265779a..bbdff9a933432 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py @@ -21,7 +21,8 @@ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz FID_WEIGHTS_URL = ( "https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams", - "8e2ae24c34c5c8b81d45167bb9361f4c", ) + "8e2ae24c34c5c8b81d45167bb9361f4c", +) WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams" @@ -47,17 +48,18 @@ class ConvNormActivation(nn.Sequential): """ def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=nn.BatchNorm2D, - 
activation_layer=nn.ReLU, - dilation=1, - bias=None, ): + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=nn.BatchNorm2D, + activation_layer=nn.ReLU, + dilation=1, + bias=None, + ): if padding is None: padding = (kernel_size - 1) // 2 * dilation if bias is None: @@ -71,7 +73,8 @@ def __init__( padding, dilation=dilation, groups=groups, - bias_attr=bias, ) + bias_attr=bias, + ) ] if norm_layer is not None: # The hyperparameter of BatchNorm2D is different from PaddlePaddle. @@ -97,12 +100,13 @@ class InceptionV3(nn.Layer): } def __init__( - self, - output_blocks=(DEFAULT_BLOCK_INDEX, ), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True, ): + self, + output_blocks=(DEFAULT_BLOCK_INDEX,), + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True, + ): """Build pretrained InceptionV3 Parameters @@ -211,8 +215,7 @@ def forward(self, inp): outp = [] x = inp if self.resize_input: - x = F.interpolate( - x, size=(299, 299), mode="bilinear", align_corners=False) + x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False) if self.normalize_input: x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) @@ -235,8 +238,7 @@ def hack_bn_layer(layer): def _inception_v3(*args, **kwargs): """Wraps `paddle.vision.models.inception_v3`""" - return paddle.vision.models.inception_v3(*args, - **kwargs).apply(hack_bn_layer) + return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer) def fid_inception_v3(): @@ -248,8 +250,7 @@ def fid_inception_v3(): This method first constructs paddle.vision's Inception and then patches the necessary parts that are different in the FID Inception model. """ - inception = _inception_v3( - num_classes=1008, with_pool=True, pretrained=False) + inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False) inception.inception_block_list[0] = InceptionA(192, pool_features=32) inception.inception_block_list[1] = InceptionA(256, pool_features=64) inception.inception_block_list[2] = InceptionA(288, pool_features=64) @@ -260,8 +261,7 @@ def fid_inception_v3(): inception.inception_block_list[9] = InceptionE_1(1280) inception.inception_block_list[10] = InceptionE_2(2048) - weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], - FID_WEIGHTS_URL[1]) + weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1]) state_dict = paddle.load(weight_path) inception.set_state_dict(state_dict) return inception @@ -275,49 +275,55 @@ def __init__(self, num_channels, pool_features): out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_1 = ConvNormActivation( in_channels=num_channels, out_channels=48, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_2 = ConvNormActivation( in_channels=48, out_channels=64, kernel_size=5, padding=2, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=64, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3 = ConvNormActivation( in_channels=96, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, 
+ ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=pool_features, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -330,8 +336,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) return x @@ -343,7 +348,8 @@ def __init__(self, num_channels, channels_7x7): out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_1 = ConvNormActivation( in_channels=num_channels, @@ -351,62 +357,70 @@ def __init__(self, num_channels, channels_7x7): kernel_size=1, stride=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), stride=1, padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(7, 1), stride=1, padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=channels_7x7, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_4 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_5 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -424,8 +438,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) return x @@ -438,61 +451,69 @@ def __init__(self, num_channels): out_channels=320, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_1 = ConvNormActivation( 
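The `# Patch:` comments in this hunk note that TensorFlow's average pooling excludes the padded zeros from its mean, which is why the reformatted calls keep `exclusive=True` on `nn.AvgPool2D`. A minimal NumPy-free sketch of the difference at an image corner (the helper and values below are illustrative, not from the diff):

def avg_pool_corner(values, pad_zeros, exclusive):
    # 3x3 window at a corner with padding=1: `values` are the real pixels
    # covered, `pad_zeros` the padded positions (which add 0 to the sum).
    total = sum(values)
    count = len(values) if exclusive else len(values) + pad_zeros
    return total / count

window = [1.0, 2.0, 3.0, 4.0]                                  # 2x2 valid pixels in the corner
print(avg_pool_corner(window, pad_zeros=5, exclusive=True))    # 2.5  (TF-style: divide by valid count)
print(avg_pool_corner(window, pad_zeros=5, exclusive=False))   # ~1.111 (padded zeros counted in the divisor)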
in_channels=num_channels, out_channels=384, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=448, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=448, out_channels=384, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -515,8 +536,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x @@ -549,6 +569,5 @@ def forward(self, x): branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x diff --git a/ppdiffusers/examples/autoencoder/vae/train_vae.py b/ppdiffusers/examples/autoencoder/vae/train_vae.py index e96c6718040c0..44a8798100e3a 100644 --- a/ppdiffusers/examples/autoencoder/vae/train_vae.py +++ b/ppdiffusers/examples/autoencoder/vae/train_vae.py @@ -28,8 +28,7 @@ from tqdm.auto import tqdm from ppdiffusers.models.ema import LitEma -from ppdiffusers.training_utils import (freeze_params, main_process_first, - unwrap_model) +from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model def read_json(file): @@ -56,8 +55,7 @@ def run_evaluate(vae, val_dataloader, writer, global_step): log_dict_ae_all = defaultdict(list) log_dict_disc_all = defaultdict(list) for batch in val_dataloader: - log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step( - batch["image"], global_step=global_step) + log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step(batch["image"], global_step=global_step) for k, v in log_dict_ae.items(): if "loss" not in k: continue @@ -71,25 +69,21 @@ def run_evaluate(vae, val_dataloader, writer, global_step): def parse_args(): - parser = argparse.ArgumentParser( - description="Simple example of a training a autoencoder model script.") + parser = 
argparse.ArgumentParser(description="Simple example of a training a autoencoder model script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, default=None, required=False, - help="Path to pretrained model or model identifier from bos.", ) + help="Path to pretrained model or model identifier from bos.", + ) parser.add_argument( "--output_dir", type=str, default="autoencoder_outputs", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=23, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=23, help="A seed for reproducible training.") parser.add_argument( "--batch_size", type=int, @@ -112,48 +106,39 @@ def parse_args(): parser.add_argument( "--scale_lr", action="store_true", - help="Scale base-lr by ngpu * batch_size", ) - parser.add_argument( - "--freeze_encoder", - action="store_true", - help="Whether to freeze encoder layer.") + help="Scale base-lr by ngpu * batch_size", + ) + parser.add_argument("--freeze_encoder", action="store_true", help="Whether to freeze encoder layer.") parser.add_argument( "--from_scratch", action="store_true", - help="Whether to train new model from scratch. ", ) - parser.add_argument( - "--vae_config_file", - default=None, - type=str, - help="Path to the vae_config_file.") + help="Whether to train new model from scratch. ", + ) + parser.add_argument("--vae_config_file", default=None, type=str, help="Path to the vae_config_file.") parser.add_argument( "--logging_dir", type=str, default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) - parser.add_argument( - "--logging_steps", - default=100, - type=int, - help="The interval steps to logging.") + help="Log writer type.", + ) + parser.add_argument("--logging_steps", default=100, type=int, help="The interval steps to logging.") parser.add_argument( "--image_logging_steps", default=500, type=int, - help="The interval steps to logging images.", ) - parser.add_argument( - "--save_steps", - default=2000, - type=int, - help="The interval steps to saveing.") + help="The interval steps to logging images.", + ) + parser.add_argument("--save_steps", default=2000, type=int, help="The interval steps to saveing.") parser.add_argument( "--ignore_keys", default=[], @@ -166,136 +151,152 @@ def parse_args(): default=None, type=int, nargs="*", - help="The height and width of the input at the encoder.", ) + help="The height and width of the input at the encoder.", + ) # dataset parser.add_argument( "--dataset_type", type=str, default="text_image_pair", choices=["imagenet", "text_image_pair"], - help="The type of dataset.", ) + help="The type of dataset.", + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--degradation", type=str, default="pil_nearest", - help="Degradation_fn, e.g. cv_bicubic, bsrgan_light, or pil_nearest", ) + help="Degradation_fn, e.g. 
cv_bicubic, bsrgan_light, or pil_nearest", + ) parser.add_argument( "--file_list", type=str, default="./data/filelist/train.filelist.list", - help="Path to the train file_list.", ) + help="Path to the train file_list.", + ) parser.add_argument( "--num_workers", type=int, default=8, - help="The number of subprocess to load data.", ) + help="The number of subprocess to load data.", + ) parser.add_argument( "--num_records", type=int, default=62500, - help="The num_records of the text_image_pair dataset.", ) + help="The num_records of the text_image_pair dataset.", + ) parser.add_argument( "--buffer_size", type=int, default=100, - help="The buffer size of the text_image_pair dataset.", ) + help="The buffer size of the text_image_pair dataset.", + ) parser.add_argument( "--shuffle_every_n_samples", type=int, default=5, - help="The shuffle_every_n_samples of the text_image_pair dataset.", ) + help="The shuffle_every_n_samples of the text_image_pair dataset.", + ) parser.add_argument( "--init_from_ckpt", type=str, default=None, - help="The path of checkpoint to be loaded.", ) + help="The path of checkpoint to be loaded.", + ) # loss fn parser.add_argument( "--disc_start", type=int, default=50001, - help="The number of steps the discriminator started.", ) + help="The number of steps the discriminator started.", + ) parser.add_argument( "--kl_weight", type=float, default=1.0e-6, - help="The weight ratio of the kl_loss.", ) + help="The weight ratio of the kl_loss.", + ) parser.add_argument( "--disc_weight", type=float, default=0.5, - help="The weight ratio of the disc_loss.", ) + help="The weight ratio of the disc_loss.", + ) parser.add_argument( "--logvar_init", type=float, default=0.0, - help="The init value of the output log variances.", ) + help="The init value of the output log variances.", + ) parser.add_argument( "--pixelloss_weight", type=float, default=1.0, - help="The weight ratio of the pixelloss.", ) + help="The weight ratio of the pixelloss.", + ) parser.add_argument( "--disc_num_layers", type=int, default=3, - help="The num layers of the discriminator.", ) + help="The num layers of the discriminator.", + ) parser.add_argument( "--disc_in_channels", type=int, default=3, - help="The in channels of the discriminator.", ) + help="The in channels of the discriminator.", + ) parser.add_argument( "--disc_factor", type=float, default=1.0, - help="The factor of the discriminator loss.", ) + help="The factor of the discriminator loss.", + ) parser.add_argument( "--perceptual_weight", type=float, default=1.0, - help="The weight ratio of the perceptual loss.", ) + help="The weight ratio of the perceptual loss.", + ) parser.add_argument( "--use_actnorm", action="store_true", - help="Whether to use actnorm in NLayerDiscriminator layer.", ) + help="Whether to use actnorm in NLayerDiscriminator layer.", + ) parser.add_argument( "--disc_conditional", action="store_true", - help="Whether to use conditional discriminator.", ) + help="Whether to use conditional discriminator.", + ) parser.add_argument( "--disc_loss", type=str, choices=["hinge", "vanilla"], default="hinge", - help="The type of discriminator loss.", ) - parser.add_argument( - "--use_ema", action="store_true", help="Whether to use_ema.") + help="The type of discriminator loss.", + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use_ema.") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether to enable_xformers_memory_efficient_attention.", ) - parser.add_argument( 
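The options in this hunk (`--pixelloss_weight`, `--perceptual_weight`, `--kl_weight`, `--disc_weight`, `--disc_factor`, `--disc_start`) parameterize an LDM-style autoencoder objective. The loss module itself is not part of this diff, so the sketch below only indicates the rough way such weights are typically combined; the function name and the exact formula are assumptions, not the project's implementation:

def sketch_ae_loss(rec_loss, perceptual_loss, kl_loss, g_loss, global_step,
                   pixelloss_weight=1.0, perceptual_weight=1.0,
                   kl_weight=1.0e-6, disc_weight=0.5, disc_factor=1.0,
                   disc_start=50001):
    # Rough shape of a VAE-GAN objective: weighted reconstruction plus a small
    # KL term, with the adversarial term switched on only after `disc_start`.
    # The real loss class (adaptive discriminator weight, learned logvar, ...)
    # is not shown in this diff.
    nll = pixelloss_weight * rec_loss + perceptual_weight * perceptual_loss
    adv = disc_factor * disc_weight * g_loss if global_step >= disc_start else 0.0
    return nll + kl_weight * kl_loss + adv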
- "--recompute", action="store_true", help="Whether to recompute.") - parser.add_argument( - "--ema_decay", - type=float, - default=0.9999, - help="The value of ema_decay.") + help="Whether to enable_xformers_memory_efficient_attention.", + ) + parser.add_argument("--recompute", action="store_true", help="Whether to recompute.") + parser.add_argument("--ema_decay", type=float, default=0.9999, help="The value of ema_decay.") args = parser.parse_args() args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - args.image_logging_steps = ( - math.ceil(args.image_logging_steps / args.logging_steps) * - args.logging_steps) + args.image_logging_steps = math.ceil(args.image_logging_steps / args.logging_steps) * args.logging_steps return args @@ -358,7 +359,8 @@ def main(): disc_loss=args.disc_loss, ema_decay=args.ema_decay, use_ema=args.use_ema, - **model_kwargs, ) + **model_kwargs, + ) else: assert args.vae_config_file is not None, "We must supply vae_config_file!" # Load config: train model from scatch @@ -378,7 +380,8 @@ def main(): disc_conditional=args.disc_conditional, disc_loss=args.disc_loss, ema_decay=args.ema_decay, - use_ema=args.use_ema, ) + use_ema=args.use_ema, + ) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) @@ -390,8 +393,7 @@ def main(): args.learning_rate = num_processes * args.batch_size * args.learning_rate # configure_optimizers - parameters = list(vae.decoder.parameters()) + list( - vae.post_quant_conv.parameters()) + parameters = list(vae.decoder.parameters()) + list(vae.post_quant_conv.parameters()) # we may freeze_encoder if not args.freeze_encoder: parameters += list(vae.encoder.parameters()) @@ -401,16 +403,13 @@ def main(): freeze_params(vae.quant_conv.parameters()) print("Freeze vae.encoder.parameters and vae.quant_conv.parameters!") - opt_ae = Adam( - parameters=parameters, - learning_rate=args.learning_rate, - beta1=0.5, - beta2=0.9) + opt_ae = Adam(parameters=parameters, learning_rate=args.learning_rate, beta1=0.5, beta2=0.9) opt_disc = Adam( parameters=vae.loss.discriminator.parameters(), learning_rate=args.learning_rate, beta1=0.5, - beta2=0.9, ) + beta2=0.9, + ) if args.use_ema: vae.model_ema = LitEma(vae, decay=args.ema_decay) if args.recompute: @@ -427,27 +426,17 @@ def main(): from ldm import ImageNetSRTrain, ImageNetSRValidation with main_process_first(): - train_dataset = ImageNetSRTrain( - size=args.resolution, degradation=args.degradation) - val_dataset = ImageNetSRValidation( - size=args.resolution, degradation=args.degradation) - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.batch_size, - shuffle=True)) - train_dataloader = DataLoader( - train_dataset, - batch_sampler=train_sampler, - num_workers=args.num_workers) - - val_sampler = BatchSampler( - val_dataset, batch_size=args.batch_size * 2, shuffle=False) - val_dataloader = DataLoader( - val_dataset, - batch_sampler=val_sampler, - num_workers=args.num_workers) + train_dataset = ImageNetSRTrain(size=args.resolution, degradation=args.degradation) + val_dataset = ImageNetSRValidation(size=args.resolution, degradation=args.degradation) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) + ) + train_dataloader = DataLoader(train_dataset, 
batch_sampler=train_sampler, num_workers=args.num_workers) + + val_sampler = BatchSampler(val_dataset, batch_size=args.batch_size * 2, shuffle=False) + val_dataloader = DataLoader(val_dataset, batch_sampler=val_sampler, num_workers=args.num_workers) else: train_dataset = TextImagePair( file_list=args.file_list, @@ -455,19 +444,21 @@ def main(): num_records=args.num_records, buffer_size=args.buffer_size, shuffle_every_n_samples=args.shuffle_every_n_samples, - interpolation="lanczos", ) + interpolation="lanczos", + ) train_dataloader = DataLoader( train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) val_dataloader = val_dataset = None # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = ( - len(train_dataloader) if args.dataset_type == "imagenet" else - math.ceil(len(train_dataset) / args.batch_size)) + len(train_dataloader) if args.dataset_type == "imagenet" else math.ceil(len(train_dataset) / args.batch_size) + ) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True @@ -475,8 +466,7 @@ def main(): if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if rank == 0: logger.info("----------- Configuration Arguments -----------") @@ -492,9 +482,7 @@ def main(): logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed) = {total_batch_size}" - ) + logger.info(f" Total train batch size (w. 
parallel, distributed) = {total_batch_size}") logger.info(f" Total optimization steps = {args.max_train_steps}") logger.info( f" Number of trainable parameters = {sum(p.numel().item() for p in vae.parameters() if not p.stop_gradient) }" @@ -515,9 +503,7 @@ def main(): # pytorch_lightning use this `toggle_optimizer` method # ref: https://github.com/Lightning-AI/lightning/blob/a58639ce7e864dd70484e7d34c37730ae204183c/src/pytorch_lightning/core/module.py#L1419-L1447 unwrap_model(vae).toggle_optimizer(optimizers, optimizer_idx) - loss, log_dict = vae(batch["image"], - optimizer_idx=optimizer_idx, - global_step=global_step) + loss, log_dict = vae(batch["image"], optimizer_idx=optimizer_idx, global_step=global_step) optimizers[optimizer_idx].clear_grad() loss.backward() optimizers[optimizer_idx].step() @@ -541,17 +527,13 @@ def main(): if global_step % args.image_logging_steps == 0: images_log = unwrap_model(vae).log_images(batch["image"]) for name, val in images_log.items(): - writer.add_image( - name, val, global_step, dataformats="NHWC") + writer.add_image(name, val, global_step, dataformats="NHWC") # saving if global_step % args.save_steps == 0: if val_dataloader is not None: - run_evaluate( - unwrap_model(vae), val_dataloader, writer, - global_step) - output_dir = os.path.join( - args.output_dir, "checkpoint-{}".format(global_step)) + run_evaluate(unwrap_model(vae), val_dataloader, writer, global_step) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) unwrap_model(vae).save_pretrained(output_dir) del logs diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py index 0c715dcb16fff..0c943be785d26 100644 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py +++ b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py @@ -13,19 +13,15 @@ @patch_to(BeamHypotheses) -def add(self: BeamHypotheses, - hyp: paddle.Tensor, - sum_logprobs: float, - origin_len: int=0) -> None: +def add(self: BeamHypotheses, hyp: paddle.Tensor, sum_logprobs: float, origin_len: int = 0) -> None: """ Add a new hypothesis to the list. """ - score = sum_logprobs / (hyp.shape[-1]**self.length_penalty) + score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty) if len(self) < self.num_beams or score > self.worst_score: self.beams.append((score, hyp)) if len(self) > self.num_beams: - sorted_next_scores = sorted( - [(s, idx) for idx, (s, _) in enumerate(self.beams)]) + sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) del self.beams[sorted_next_scores[0][1]] self.worst_score = sorted_next_scores[1][0] else: @@ -33,10 +29,7 @@ def add(self: BeamHypotheses, @patch_to(BeamHypotheses) -def is_done(self: BeamHypotheses, - best_sum_logprobs: float, - cur_len: int, - origin_len: int=0) -> bool: +def is_done(self: BeamHypotheses, best_sum_logprobs: float, cur_len: int, origin_len: int = 0) -> bool: """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. 
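The patched `add` above scores a finished hypothesis as `sum_logprobs / hyp.shape[-1] ** self.length_penalty`, so longer sequences are not penalized merely for accumulating more negative log-probabilities, and `is_done` stops the search once no pending beam can beat the worst kept score. A small standalone example of that scoring rule:

def hyp_score(sum_logprobs, length, length_penalty=1.0):
    # Length-normalized score used to compare finished beam hypotheses,
    # mirroring the patched `add` above.
    return sum_logprobs / (length ** length_penalty)

short = [-0.5, -0.6]                        # 2 tokens, raw sum -1.1
longer = [-0.5, -0.6, -0.4, -0.3]           # 4 tokens, raw sum -1.8
print(hyp_score(sum(short), len(short)))    # -0.55
print(hyp_score(sum(longer), len(longer)))  # -0.45: preferred despite the lower raw sum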
@@ -54,35 +47,31 @@ def is_done(self: BeamHypotheses, class BLIP_Decoder(nn.Layer): def __init__( - self, - pretrained_model_name_or_path, - prompt="a picture of ", ): + self, + pretrained_model_name_or_path, + prompt="a picture of ", + ): super().__init__() - self.text_decoder = BlipForConditionalGeneration.from_pretrained( - pretrained_model_name_or_path) + self.text_decoder = BlipForConditionalGeneration.from_pretrained(pretrained_model_name_or_path) self.text_decoder.eval() - self.processor = BlipProcessor.from_pretrained( - pretrained_model_name_or_path) + self.processor = BlipProcessor.from_pretrained(pretrained_model_name_or_path) self.processor.tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - self.processor.tokenizer.add_special_tokens({ - "additional_special_tokens": ["[ENC]"] - }) - self.processor.tokenizer.enc_token_id = ( - self.processor.tokenizer.additional_special_tokens_ids[0]) + self.processor.tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]}) + self.processor.tokenizer.enc_token_id = self.processor.tokenizer.additional_special_tokens_ids[0] self.prompt = prompt - self.prompt_length = len( - self.processor.tokenizer(self.prompt).input_ids) - 1 + self.prompt_length = len(self.processor.tokenizer(self.prompt).input_ids) - 1 def generate( - self, - image, - prompt=None, - sample=False, - num_beams=3, - max_length=30, - min_length=10, - top_p=0.9, - repetition_penalty=1.0, ): + self, + image, + prompt=None, + sample=False, + num_beams=3, + max_length=30, + min_length=10, + top_p=0.9, + repetition_penalty=1.0, + ): if prompt is None: prompt = self.prompt prompt_length = self.prompt_length @@ -93,8 +82,7 @@ def generate( else: model_kwargs = {"pixel_values": image} prompt = [prompt] * model_kwargs["pixel_values"].shape[0] - input_ids = self.processor.tokenizer( - prompt, return_tensors="pd").input_ids + input_ids = self.processor.tokenizer(prompt, return_tensors="pd").input_ids if sample: # nucleus sampling @@ -106,7 +94,8 @@ def generate( top_p=top_p, num_return_sequences=1, repetition_penalty=repetition_penalty, - **model_kwargs, )[0] + **model_kwargs, + )[0] else: if num_beams == 1: # greedy search @@ -115,7 +104,8 @@ def generate( max_length=max_length - prompt_length, min_length=min_length, decode_strategy="greedy_search", - **model_kwargs, )[0] + **model_kwargs, + )[0] else: # beam search outputs = self.text_decoder.generate( @@ -126,11 +116,10 @@ def generate( decode_strategy="beam_search", repetition_penalty=repetition_penalty, length_penalty=1.0, # note this is not - **model_kwargs, )[0] + **model_kwargs, + )[0] captions = [] for output in outputs: - captions.append( - self.processor.decode( - output, skip_special_tokens=True)) + captions.append(self.processor.decode(output, skip_special_tokens=True)) return captions diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py index 380024d3d617e..9cefe1a3b543d 100644 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py +++ b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py @@ -63,19 +63,16 @@ def __init__(self, config: Config): def load_blip_model(self): config = self.config - self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config. 
- blip_pretrained_model_name_or_path) + self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config.blip_pretrained_model_name_or_path) self.blip_model.eval() def load_clip_model(self): config = self.config # clip model - self.clip_model: CLIPModel = CLIPModel.from_pretrained( - config.clip_pretrained_model_name_or_path) + self.clip_model: CLIPModel = CLIPModel.from_pretrained(config.clip_pretrained_model_name_or_path) self.clip_model.eval() - self.clip_preprocess = CLIPProcessor.from_pretrained( - config.clip_pretrained_model_name_or_path) + self.clip_preprocess = CLIPProcessor.from_pretrained(config.clip_pretrained_model_name_or_path) sites = [ "Artstation", @@ -113,41 +110,45 @@ def load_clip_model(self): return_tensors="pd", padding="max_length", truncation=True, - max_length=self.clip_preprocess.tokenizer.model_max_length, ) - self.artists = LabelTable(artists, "artists", self.clip_model, - self.tokenize, config) + max_length=self.clip_preprocess.tokenizer.model_max_length, + ) + self.artists = LabelTable(artists, "artists", self.clip_model, self.tokenize, config) self.flavors = LabelTable( _load_list(config.data_path, "flavors.txt"), "flavors", self.clip_model, self.tokenize, - config, ) + config, + ) self.mediums = LabelTable( _load_list(config.data_path, "mediums.txt"), "mediums", self.clip_model, self.tokenize, - config, ) + config, + ) self.movements = LabelTable( _load_list(config.data_path, "movements.txt"), "movements", self.clip_model, self.tokenize, - config, ) - self.trendings = LabelTable(trending_list, "trendings", self.clip_model, - self.tokenize, config) + config, + ) + self.trendings = LabelTable(trending_list, "trendings", self.clip_model, self.tokenize, config) self.pad_token_id = self.clip_preprocess.tokenizer.pad_token_id def generate_caption(self, pil_image: Image) -> str: size = self.config.blip_image_eval_size - gpu_image = transforms.Compose([ - transforms.Resize( - (size, size), interpolation="bicubic"), - transforms.ToTensor(), - transforms.Normalize( - self.clip_preprocess.image_processor.image_mean, - self.clip_preprocess.image_processor.image_std, ), - ])(pil_image).unsqueeze(0) + gpu_image = transforms.Compose( + [ + transforms.Resize((size, size), interpolation="bicubic"), + transforms.ToTensor(), + transforms.Normalize( + self.clip_preprocess.image_processor.image_mean, + self.clip_preprocess.image_processor.image_std, + ), + ] + )(pil_image).unsqueeze(0) with paddle.no_grad(): caption = self.blip_model.generate( @@ -157,18 +158,18 @@ def generate_caption(self, pil_image: Image) -> str: max_length=self.config.blip_max_length, min_length=self.config.blip_min_length, top_p=self.config.blip_top_p, - repetition_penalty=self.config.blip_repetition_penalty, ) + repetition_penalty=self.config.blip_repetition_penalty, + ) return caption[0] def image_to_features(self, image: Image) -> paddle.Tensor: images = self.clip_preprocess(images=image, return_tensors="pd") with paddle.no_grad(): - image_features = self.clip_model.get_image_features(images[ - "pixel_values"]) + image_features = self.clip_model.get_image_features(images["pixel_values"]) image_features /= image_features.norm(axis=-1, keepdim=True) return image_features - def interrogate_classic(self, image: Image, max_flavors: int=3) -> str: + def interrogate_classic(self, image: Image, max_flavors: int = 3) -> str: caption = self.generate_caption(image) image_features = self.image_to_features(image) @@ -185,25 +186,21 @@ def interrogate_classic(self, image: Image, max_flavors: int=3) -> str: return 
_truncate_to_fit(prompt, self.tokenize, self.pad_token_id) - def interrogate_fast(self, image: Image, max_flavors: int=32) -> str: + def interrogate_fast(self, image: Image, max_flavors: int = 32) -> str: caption = self.generate_caption(image) image_features = self.image_to_features(image) merged = _merge_tables( - [ - self.artists, self.flavors, self.mediums, self.movements, - self.trendings - ], - self.config, ) + [self.artists, self.flavors, self.mediums, self.movements, self.trendings], + self.config, + ) tops = merged.rank(image_features, max_flavors) - return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize, - self.pad_token_id) + return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize, self.pad_token_id) - def interrogate(self, image: Image, max_flavors: int=32) -> str: + def interrogate(self, image: Image, max_flavors: int = 32) -> str: caption = self.generate_caption(image) image_features = self.image_to_features(image) - flaves = self.flavors.rank(image_features, - self.config.flavor_intermediate_count) + flaves = self.flavors.rank(image_features, self.config.flavor_intermediate_count) best_medium = self.mediums.rank(image_features, 1)[0] best_artist = self.artists.rank(image_features, 1)[0] best_trending = self.trendings.rank(image_features, 1)[0] @@ -225,65 +222,50 @@ def check(addition: str) -> bool: def check_multi_batch(opts: List[str]): nonlocal best_prompt, best_sim prompts = [] - for i in range(2**len(opts)): + for i in range(2 ** len(opts)): prompt = best_prompt for bit in range(len(opts)): if i & (1 << bit): prompt += ", " + opts[bit] prompts.append(prompt) - t = LabelTable(prompts, None, self.clip_model, self.tokenize, - self.config) + t = LabelTable(prompts, None, self.clip_model, self.tokenize, self.config) best_prompt = t.rank(image_features, 1)[0] best_sim = self.similarity(image_features, best_prompt) - check_multi_batch( - [best_medium, best_artist, best_trending, best_movement]) + check_multi_batch([best_medium, best_artist, best_trending, best_movement]) extended_flavors = set(flaves) - for i in tqdm( - range(max_flavors), desc="Flavor chain", - disable=self.config.quiet): - best = self.rank_top( - image_features, - [f"{best_prompt}, {f}" for f in extended_flavors]) - flave = best[len(best_prompt) + 2:] + for i in tqdm(range(max_flavors), desc="Flavor chain", disable=self.config.quiet): + best = self.rank_top(image_features, [f"{best_prompt}, {f}" for f in extended_flavors]) + flave = best[len(best_prompt) + 2 :] if not check(flave): break - if _prompt_at_max_len(best_prompt, self.tokenize, - self.pad_token_id): + if _prompt_at_max_len(best_prompt, self.tokenize, self.pad_token_id): break extended_flavors.remove(flave) return best_prompt - def rank_top(self, image_features: paddle.Tensor, - text_array: List[str]) -> str: + def rank_top(self, image_features: paddle.Tensor, text_array: List[str]) -> str: text_tokens = self.tokenize(text_array) with paddle.no_grad(): - text_features = self.clip_model.get_text_features(text_tokens[ - "input_ids"]) + text_features = self.clip_model.get_text_features(text_tokens["input_ids"]) text_features /= text_features.norm(axis=-1, keepdim=True) - similarity = text_features @image_features.T + similarity = text_features @ image_features.T return text_array[similarity.argmax().item()] def similarity(self, image_features: paddle.Tensor, text: str) -> float: text_tokens = self.tokenize([text]) with paddle.no_grad(): - text_features = self.clip_model.get_text_features(text_tokens[ - "input_ids"]) + 
text_features = self.clip_model.get_text_features(text_tokens["input_ids"]) text_features /= text_features.norm(axis=-1, keepdim=True) - similarity = text_features @image_features.T + similarity = text_features @ image_features.T return similarity[0][0].item() class LabelTable: - def __init__(self, - labels: List[str], - desc: str, - clip_model, - tokenize, - config: Config): + def __init__(self, labels: List[str], desc: str, clip_model, tokenize, config: Config): self.chunk_size = config.chunk_size self.config = config self.embeds = [] @@ -295,10 +277,8 @@ def __init__(self, cache_filepath = None if config.cache_path is not None and desc is not None: os.makedirs(config.cache_path, exist_ok=True) - sanitized_name = config.clip_pretrained_model_name_or_path.replace( - "/", "_").replace("@", "_") - cache_filepath = os.path.join(config.cache_path, - f"{sanitized_name}_{desc}.pkl") + sanitized_name = config.clip_pretrained_model_name_or_path.replace("/", "_").replace("@", "_") + cache_filepath = os.path.join(config.cache_path, f"{sanitized_name}_{desc}.pkl") if desc is not None and os.path.exists(cache_filepath): with open(cache_filepath, "rb") as f: try: @@ -311,16 +291,15 @@ def __init__(self, if len(self.labels) != len(self.embeds): self.embeds = [] - chunks = np.array_split( - self.labels, max(1, len(self.labels) / config.chunk_size)) + chunks = np.array_split(self.labels, max(1, len(self.labels) / config.chunk_size)) for chunk in tqdm( - chunks, - desc=f"Preprocessing {desc}" if desc else None, - disable=self.config.quiet, ): + chunks, + desc=f"Preprocessing {desc}" if desc else None, + disable=self.config.quiet, + ): text_tokens = self.tokenize(chunk.tolist()) with paddle.no_grad(): - text_features = clip_model.get_text_features(text_tokens[ - "input_ids"]) + text_features = clip_model.get_text_features(text_tokens["input_ids"]) text_features /= text_features.norm(axis=-1, keepdim=True) text_features = text_features.cpu().numpy() for i in range(text_features.shape[0]): @@ -335,22 +314,23 @@ def __init__(self, "hash": hash, "model": config.clip_pretrained_model_name_or_path, }, - f, ) + f, + ) def _rank( - self, - image_features: paddle.Tensor, - text_embeds: paddle.Tensor, - top_count: int=1, ) -> str: + self, + image_features: paddle.Tensor, + text_embeds: paddle.Tensor, + top_count: int = 1, + ) -> str: top_count = min(top_count, len(text_embeds)) text_embeds = paddle.to_tensor(text_embeds) - similarity = image_features @text_embeds.T + similarity = image_features @ text_embeds.T _, top_labels = similarity.cast("float32").topk(top_count, axis=-1) top_labels = top_labels.tolist() return [top_labels[0][i] for i in range(top_count)] - def rank(self, image_features: paddle.Tensor, - top_count: int=1) -> List[str]: + def rank(self, image_features: paddle.Tensor, top_count: int = 1) -> List[str]: if len(self.labels) <= self.chunk_size: tops = self._rank(image_features, self.embeds, top_count=top_count) return [self.labels[i] for i in tops] @@ -362,10 +342,7 @@ def rank(self, image_features: paddle.Tensor, for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet): start = chunk_idx * self.chunk_size stop = min(start + self.chunk_size, len(self.embeds)) - tops = self._rank( - image_features, - self.embeds[start:stop], - top_count=keep_per_chunk) + tops = self._rank(image_features, self.embeds[start:stop], top_count=keep_per_chunk) top_labels.extend([self.labels[start + i] for i in tops]) top_embeds.extend([self.embeds[start + i] for i in tops]) @@ -374,11 +351,7 @@ def rank(self, 
image_features: paddle.Tensor, def _load_list(data_path: str, filename: str) -> List[str]: - with open( - os.path.join(data_path, filename), - "r", - encoding="utf-8", - errors="replace") as f: + with open(os.path.join(data_path, filename), "r", encoding="utf-8", errors="replace") as f: items = [line.strip() for line in f.readlines()] return items @@ -391,7 +364,7 @@ def _merge_tables(tables: List[LabelTable], config: Config) -> LabelTable: return m -def _prompt_at_max_len(text: str, tokenize, pad_token_id: int=0) -> bool: +def _prompt_at_max_len(text: str, tokenize, pad_token_id: int = 0) -> bool: tokens = tokenize([text])["input_ids"] return tokens[0][-1] != pad_token_id diff --git a/ppdiffusers/examples/clip_interrogator/dumpy.py b/ppdiffusers/examples/clip_interrogator/dumpy.py index 9a6e930b2e198..552e84eae5944 100644 --- a/ppdiffusers/examples/clip_interrogator/dumpy.py +++ b/ppdiffusers/examples/clip_interrogator/dumpy.py @@ -14,9 +14,12 @@ # limitations under the License. import gradio as gr -from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) blip_pretrained_model_name_or_path = "Salesforce/blip-image-captioning-base" clip_pretrained_model_name_or_path = "openai/clip-vit-large-patch14" @@ -38,16 +41,18 @@ config = Config( blip_num_beams=64, blip_pretrained_model_name_or_path=blip_pretrained_model_name_or_path, - clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path, ) + clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path, +) ci = Interrogator(config) def inference(image, mode, best_max_flavors=32): - ci.config.chunk_size = (2048 if ci.config.clip_pretrained_model_name_or_path - == "openai/clip-vit-large-patch14" else 1024) + ci.config.chunk_size = ( + 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024 + ) ci.config.flavor_intermediate_count = ( - 2048 if ci.config.clip_pretrained_model_name_or_path == - "openai/clip-vit-large-patch14" else 1024) + 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024 + ) image = image.convert("RGB") if mode == "best": return ci.interrogate(image, max_flavors=int(best_max_flavors)) @@ -59,16 +64,17 @@ def inference(image, mode, best_max_flavors=32): inputs = [ gr.inputs.Image(type="pil"), - gr.Radio( - ["best", "fast", "classic"], label="", value="best"), - gr.Number( - value=16, label="best mode max flavors"), + gr.Radio(["best", "fast", "classic"], label="", value="best"), + gr.Number(value=16, label="best mode max flavors"), +] +outputs = [ + gr.outputs.Textbox(label="Output"), ] -outputs = [gr.outputs.Textbox(label="Output"), ] io = gr.Interface( inference, inputs, outputs, - allow_flagging=False, ) + allow_flagging=False, +) io.launch(debug=False, server_name="0.0.0.0", server_port=8586) diff --git a/ppdiffusers/examples/clip_interrogator/predict.py b/ppdiffusers/examples/clip_interrogator/predict.py index d42d5a666a53c..bb6dd5f6004b7 100644 --- a/ppdiffusers/examples/clip_interrogator/predict.py +++ b/ppdiffusers/examples/clip_interrogator/predict.py @@ -15,9 +15,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
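`LabelTable.rank` above avoids scoring every label at once: the embedding table is split into chunks of `chunk_size`, each chunk is ranked against the image embedding by a plain matrix product (both sides are already L2-normalized, so this is cosine similarity), the best entries of each chunk are pooled, and a final pass re-ranks the survivors. A NumPy sketch of that two-stage top-k; how `keep_per_chunk` is derived is not visible in this hunk, so it is just a parameter here:

import numpy as np

def chunked_rank(image_feat, label_embeds, chunk_size=4, keep_per_chunk=2, top_count=1):
    # image_feat: (d,) unit vector; label_embeds: (n, d), each row a unit vector.
    survivors = []
    for start in range(0, len(label_embeds), chunk_size):
        sims = label_embeds[start:start + chunk_size] @ image_feat      # cosine similarity per label
        survivors.extend(start + np.argsort(-sims)[:keep_per_chunk])    # best labels of this chunk
    survivors = np.asarray(survivors)
    final = label_embeds[survivors] @ image_feat                        # re-rank the pooled survivors
    return survivors[np.argsort(-final)[:top_count]]

rng = np.random.default_rng(0)
embeds = rng.normal(size=(10, 8))
embeds /= np.linalg.norm(embeds, axis=1, keepdims=True)
print(chunked_rank(embeds[7], embeds))  # [7]: a label is most similar to itself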
-from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) from cog import BasePredictor, Input, Path from PIL import Image @@ -28,29 +31,32 @@ def setup(self): Config( blip_pretrained_model_name_or_path="Salesforce/blip-image-captioning-large", clip_pretrained_model_name_or_path="openai/clip-vit-large-patch14", - device="gpu", )) + device="gpu", + ) + ) def predict( - self, - image: Path=Input(description="Input image"), - clip_pretrained_model_name_or_path: str=Input( - default="openai/clip-vit-large-patch14", - choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2", - ), - blip_pretrained_model_name_or_path: str=Input( - default="Salesforce/blip-image-captioning-large", - choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - description="Choose Salesforce/blip-image-captioning-large", ), - mode: str=Input( - default="best", - choices=["best", "classic", "fast"], - description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).", - ), ) -> str: + self, + image: Path = Input(description="Input image"), + clip_pretrained_model_name_or_path: str = Input( + default="openai/clip-vit-large-patch14", + choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2", + ), + blip_pretrained_model_name_or_path: str = Input( + default="Salesforce/blip-image-captioning-large", + choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + description="Choose Salesforce/blip-image-captioning-large", + ), + mode: str = Input( + default="best", + choices=["best", "classic", "fast"], + description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).", + ), + ) -> str: """Run a single prediction on the model""" image = Image.open(str(image)).convert("RGB") - self.switch_model(clip_pretrained_model_name_or_path, - blip_pretrained_model_name_or_path) + self.switch_model(clip_pretrained_model_name_or_path, blip_pretrained_model_name_or_path) if mode == "best": return self.ci.interrogate(image) elif mode == "classic": @@ -59,16 +65,13 @@ def predict( return self.ci.interrogate_fast(image) def switch_model( - self, - clip_pretrained_model_name_or_path: str, - blip_pretrained_model_name_or_path: str, ): - if (clip_pretrained_model_name_or_path != - self.ci.config.clip_pretrained_model_name_or_path): - self.ci.config.clip_pretrained_model_name_or_path = ( - clip_pretrained_model_name_or_path) + self, + clip_pretrained_model_name_or_path: str, + blip_pretrained_model_name_or_path: str, + ): + if clip_pretrained_model_name_or_path != self.ci.config.clip_pretrained_model_name_or_path: + self.ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path self.ci.load_clip_model() - if (blip_pretrained_model_name_or_path != - self.ci.config.blip_pretrained_model_name_or_path): - self.ci.config.blip_pretrained_model_name_or_path = ( - blip_pretrained_model_name_or_path) + if blip_pretrained_model_name_or_path != self.ci.config.blip_pretrained_model_name_or_path: + self.ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path self.ci.load_blip_model() diff --git a/ppdiffusers/examples/clip_interrogator/run_cli.py b/ppdiffusers/examples/clip_interrogator/run_cli.py index 081717fcf915d..c905195af03f8 100755 --- 
a/ppdiffusers/examples/clip_interrogator/run_cli.py +++ b/ppdiffusers/examples/clip_interrogator/run_cli.py @@ -21,9 +21,12 @@ import paddle import requests -from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) from PIL import Image @@ -44,18 +47,16 @@ def main(): "--clip", default="openai/clip-vit-large-patch14", choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of CLIP model to use", ) + help="name of CLIP model to use", + ) parser.add_argument( "-b", "--blip", default="Salesforce/blip-image-captioning-large", choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of BLIP model to use", ) - parser.add_argument( - "-d", - "--device", - default="auto", - help="device to use (auto, gpu or cpu)") + help="name of BLIP model to use", + ) + parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") parser.add_argument("-f", "--folder", help="path to folder of images") parser.add_argument("-i", "--image", help="image file or url") parser.add_argument( @@ -63,7 +64,8 @@ def main(): "--mode", default="best", choices=["best", "classic", "fast"], - help="best, classic, or fast", ) + help="best, classic, or fast", + ) args = parser.parse_args() if not args.folder and not args.image: @@ -71,8 +73,7 @@ def main(): exit(1) if args.folder is not None and args.image is not None: - print( - "Specify a folder or batch processing or a single image, not both") + print("Specify a folder or batch processing or a single image, not both") exit(1) # validate clip model name @@ -98,16 +99,15 @@ def main(): # generate a nice prompt config = Config( clip_pretrained_model_name_or_path=args.clip, - blip_pretrained_model_name_or_path=args.blip, ) + blip_pretrained_model_name_or_path=args.blip, + ) ci = Interrogator(config) # process single image if args.image is not None: image_path = args.image - if str(image_path).startswith("http://") or str(image_path).startswith( - "https://"): - image = Image.open(requests.get(image_path, stream=True) - .raw).convert("RGB") + if str(image_path).startswith("http://") or str(image_path).startswith("https://"): + image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB") else: image = Image.open(image_path).convert("RGB") if not image: @@ -121,10 +121,7 @@ def main(): print(f"The folder {args.folder} does not exist!") exit(1) - files = [ - f for f in os.listdir(args.folder) - if f.endswith(".jpg") or f.endswith(".png") - ] + files = [f for f in os.listdir(args.folder) if f.endswith(".jpg") or f.endswith(".png")] prompts = [] for file in files: image = Image.open(os.path.join(args.folder, file)).convert("RGB") @@ -140,9 +137,7 @@ def main(): for file, prompt in zip(files, prompts): w.writerow([file, prompt]) - print( - f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!" 
- ) + print(f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!") if __name__ == "__main__": diff --git a/ppdiffusers/examples/clip_interrogator/run_gradio.py b/ppdiffusers/examples/clip_interrogator/run_gradio.py index 435c7c46a265b..60c35b66fe030 100755 --- a/ppdiffusers/examples/clip_interrogator/run_gradio.py +++ b/ppdiffusers/examples/clip_interrogator/run_gradio.py @@ -19,9 +19,12 @@ import gradio as gr import paddle -from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) parser = argparse.ArgumentParser() parser.add_argument( @@ -29,19 +32,18 @@ "--clip", default="openai/clip-vit-large-patch14", choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of CLIP model to use", ) + help="name of CLIP model to use", +) parser.add_argument( "-b", "--blip", default="Salesforce/blip-image-captioning-large", choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of BLIP model to use", ) -parser.add_argument( - "-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") -parser.add_argument( - "-s", "--share", action="store_true", help="Create a public link") -parser.add_argument( - "--server_name", default="0.0.0.0", type=str, help="server_name") + help="name of BLIP model to use", +) +parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") +parser.add_argument("-s", "--share", action="store_true", help="Create a public link") +parser.add_argument("--server_name", default="0.0.0.0", type=str, help="server_name") parser.add_argument("--server_port", default=8586, type=int, help="server_port") args = parser.parse_args() @@ -69,31 +71,29 @@ config = Config( cache_path="cache", clip_pretrained_model_name_or_path=args.clip, - blip_pretrained_model_name_or_path=args.blip, ) + blip_pretrained_model_name_or_path=args.blip, +) ci = Interrogator(config) def inference( - image, - mode, - clip_pretrained_model_name_or_path, - blip_pretrained_model_name_or_path, - blip_min_length, - blip_max_length, - blip_sample, - blip_top_p, - blip_repetition_penalty, - blip_num_beams, ): - if (clip_pretrained_model_name_or_path != - ci.config.clip_pretrained_model_name_or_path): - ci.config.clip_pretrained_model_name_or_path = ( - clip_pretrained_model_name_or_path) + image, + mode, + clip_pretrained_model_name_or_path, + blip_pretrained_model_name_or_path, + blip_min_length, + blip_max_length, + blip_sample, + blip_top_p, + blip_repetition_penalty, + blip_num_beams, +): + if clip_pretrained_model_name_or_path != ci.config.clip_pretrained_model_name_or_path: + ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path ci.load_clip_model() - if (blip_pretrained_model_name_or_path != - ci.config.blip_pretrained_model_name_or_path): - ci.config.blip_pretrained_model_name_or_path = ( - blip_pretrained_model_name_or_path) + if blip_pretrained_model_name_or_path != ci.config.blip_pretrained_model_name_or_path: + ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path ci.load_blip_model() ci.config.blip_min_length = int(blip_min_length) @@ -114,36 +114,25 @@ def inference( inputs = [ gr.inputs.Image(type="pil"), - gr.Radio( - ["best", "classic", "fast"], label="Mode", value="fast"), - gr.Dropdown( - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip, - label="CLIP Model"), - gr.Dropdown( - 
BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip, - label="BLIP Model"), - gr.Number( - value=8, label="Caption min Length"), - gr.Number( - value=32, label="Caption Max Length"), - gr.Radio( - ["True", "False"], value="False", label="Sample or not?"), - gr.Number( - value=0.9, label="TopP value, when Sample is true"), - gr.Number( - value=1.1, label="Repetition penalty value, when Sample is false"), - gr.Number( - value=64, label="Caption Num Beams, when Sample is false"), + gr.Radio(["best", "classic", "fast"], label="Mode", value="fast"), + gr.Dropdown(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip, label="CLIP Model"), + gr.Dropdown(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip, label="BLIP Model"), + gr.Number(value=8, label="Caption min Length"), + gr.Number(value=32, label="Caption Max Length"), + gr.Radio(["True", "False"], value="False", label="Sample or not?"), + gr.Number(value=0.9, label="TopP value, when Sample is true"), + gr.Number(value=1.1, label="Repetition penalty value, when Sample is false"), + gr.Number(value=64, label="Caption Num Beams, when Sample is false"), +] +outputs = [ + gr.outputs.Textbox(label="Image Caption Output"), ] -outputs = [gr.outputs.Textbox(label="Image Caption Output"), ] io = gr.Interface( inference, inputs, outputs, title="🕵️‍♂️ Paddle CLIP Interrogator 🕵️‍♂️", - allow_flagging=False, ) -io.launch( - share=args.share, - server_name=args.server_name, - server_port=args.server_port) + allow_flagging=False, +) +io.launch(share=args.share, server_name=args.server_name, server_port=args.server_port) diff --git a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py index 3ef59efaf907f..f4495bba5b6f4 100644 --- a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -20,18 +20,30 @@ import paddle.nn.functional as F import PIL from einops import rearrange -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPModel, - CLIPTextModel, CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPModel, + CLIPTextModel, + CLIPTokenizer, +) from tqdm import tqdm -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.loaders import FromCkptMixin -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( + StableDiffusionPipelineOutput, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.utils import PIL_INTERPOLATION, logging, randn_tensor logger = logging.get_logger(__name__) @@ -43,11 +55,7 @@ def preprocess(image, w, h): elif isinstance(image, PIL.Image.Image): image = [image] if isinstance(image[0], PIL.Image.Image): - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[(None), :] - for i in image - ] + image = [np.array(i.resize((w, h), 
resample=PIL_INTERPOLATION["lanczos"]))[(None), :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -82,11 +90,12 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): def spherical_dist_loss(x, y): x = F.normalize(x=x, axis=-1) y = F.normalize(x=y, axis=-1) - return (paddle.divide( - (x - y).norm(axis=-1), paddle.to_tensor( - 2, dtype=x.dtype)).asin().pow(y=paddle.to_tensor( - 2, dtype=x.dtype)).multiply(y=paddle.to_tensor( - 2, dtype=x.dtype))) + return ( + paddle.divide((x - y).norm(axis=-1), paddle.to_tensor(2, dtype=x.dtype)) + .asin() + .pow(y=paddle.to_tensor(2, dtype=x.dtype)) + .multiply(y=paddle.to_tensor(2, dtype=x.dtype)) + ) def set_requires_grad(model, value): @@ -97,20 +106,25 @@ def set_requires_grad(model, value): class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, FromCkptMixin): # _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - clip_model: CLIPModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, - DPMSolverMultistepScheduler, ], - feature_extractor: CLIPFeatureExtractor, - safety_checker: StableDiffusionSafetyChecker, - blip_model=None, - blip_processor=None, - clip_interrogator=None, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + clip_model: CLIPModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + PNDMScheduler, + LMSDiscreteScheduler, + DDIMScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: CLIPFeatureExtractor, + safety_checker: StableDiffusionSafetyChecker, + blip_model=None, + blip_processor=None, + clip_interrogator=None, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -139,18 +153,21 @@ def __init__( blip_model=blip_model, blip_processor=blip_processor, clip_interrogator=clip_interrogator, - safety_checker=safety_checker, ) + safety_checker=safety_checker, + ) self.feature_extractor_size = ( - feature_extractor.size if isinstance(feature_extractor.size, int) - else feature_extractor.size["shortest_edge"]) + feature_extractor.size + if isinstance(feature_extractor.size, int) + else feature_extractor.size["shortest_edge"] + ) self.normalize = paddle.vision.transforms.Normalize( - mean=feature_extractor.image_mean, std=feature_extractor.image_std) + mean=feature_extractor.image_mean, std=feature_extractor.image_std + ) set_requires_grad(self.text_encoder, False) set_requires_grad(self.clip_model, False) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): if slice_size == "auto": slice_size = self.unet.config.attention_head_dim // 2 self.unet.set_attention_slice(slice_size) @@ -171,46 +188,35 @@ def unfreeze_unet(self): set_requires_grad(self.unet, True) def get_timesteps(self, num_inference_steps, strength): - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] return timesteps, num_inference_steps - t_start 
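`spherical_dist_loss` above is the squared spherical distance commonly used for CLIP guidance: for unit vectors separated by angle theta, ||x - y|| = 2*sin(theta/2), so 2*arcsin(||x - y|| / 2)**2 equals theta**2 / 2. A quick NumPy check of that identity (values are illustrative):

import numpy as np

def spherical_dist(x, y):
    # Mirrors the paddle expression above: 2 * arcsin(||x - y|| / 2) ** 2
    return 2.0 * np.arcsin(np.linalg.norm(x - y) / 2.0) ** 2

theta = 0.3                                    # angle between two unit vectors
x = np.array([1.0, 0.0])
y = np.array([np.cos(theta), np.sin(theta)])
print(spherical_dist(x, y))                    # ~0.045
print(theta ** 2 / 2)                          # ~0.045, i.e. theta^2 / 2 as expected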
def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept - def prepare_latents(self, - image, - timestep, - batch_size, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, dtype, generator=None): if not isinstance(image, paddle.Tensor): - raise ValueError( - f"`image` has to be of type `torch.Tensor` but is {type(image)}") + raise ValueError(f"`image` has to be of type `torch.Tensor` but is {type(image)}") image = image.cast(dtype) if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(x=init_latents, axis=0) else: init_latents = self.vae.encode(image).latent_dist.sample(generator) init_latents = 0.18215 * init_latents - init_latents = init_latents.repeat_interleave( - repeats=batch_size, axis=0) - noise = randn_tensor( - init_latents.shape, generator=generator, dtype=dtype) + init_latents = init_latents.repeat_interleave(repeats=batch_size, axis=0) + noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype) # get latents init_latents = self.scheduler.add_noise(init_latents, noise, timestep) @@ -223,63 +229,53 @@ def get_image_description(self, image): else: # with paddle.no_grad(), paddle.amp.auto_cast(): inputs = self.blip_processor(images=image, return_tensors="pd") - inputs["pixel_values"] = inputs["pixel_values"].cast( - self.blip_model.dtype) + inputs["pixel_values"] = inputs["pixel_values"].cast(self.blip_model.dtype) # out = self.blip_model.generate(**inputs, decode_strategy="beam_search", num_beams=2, length_penalty=0, max_length=5) out = self.blip_model.generate(**inputs) - return self.blip_processor.decode( - out[0][0], skip_special_tokens=True) + return self.blip_processor.decode(out[0][0], skip_special_tokens=True) def get_clip_image_embeddings(self, image, batch_size): clip_image_input = self.feature_extractor.preprocess(image) clip_image_features = ( - paddle.to_tensor(data=clip_image_input["pixel_values"][0]) - .unsqueeze(axis=0).astype(dtype="float16")) - image_embeddings_clip = self.clip_model.get_image_features( - clip_image_features) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) - image_embeddings_clip = image_embeddings_clip.repeat_interleave( - repeats=batch_size, axis=0) + paddle.to_tensor(data=clip_image_input["pixel_values"][0]).unsqueeze(axis=0).astype(dtype="float16") + ) + image_embeddings_clip = self.clip_model.get_image_features(clip_image_features) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip.repeat_interleave(repeats=batch_size, axis=0) return image_embeddings_clip @paddle.enable_grad() def cond_fn( - self, - latents, - timestep, - index, - text_embeddings, - noise_pred_original, - original_image_embeddings_clip, - clip_guidance_scale, ): + self, + latents, + 
timestep, + index, + text_embeddings, + noise_pred_original, + original_image_embeddings_clip, + clip_guidance_scale, + ): out_0 = latents.detach() out_0.stop_gradient = not True latents = out_0 latent_model_input = self.scheduler.scale_model_input(latents, timestep) # predict the noise residual - noise_pred = self.unet( - latent_model_input, timestep, - encoder_hidden_states=text_embeddings).sample - if isinstance( - self.scheduler, - (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): + noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample + if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): alpha_prod_t = self.scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t # compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = ( - latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5 + pred_original_sample = (latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5 fac = paddle.sqrt(x=beta_prod_t) sample = pred_original_sample * fac + latents * (1 - fac) elif isinstance(self.scheduler, LMSDiscreteScheduler): sigma = self.scheduler.sigmas[index] sample = latents - sigma * noise_pred else: - raise ValueError( - f"scheduler type {type(self.scheduler)} not supported") + raise ValueError(f"scheduler type {type(self.scheduler)} not supported") # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor sample = 1 / 0.18215 * sample @@ -289,56 +285,48 @@ def cond_fn( # image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image) c_size = image.shape[0] image = rearrange(image, "c t h w -> (c t) h w") - image = paddle.vision.transforms.Resize(self.feature_extractor_size)( - image) + image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image) image = rearrange(image, "(c t) h w -> c t h w", c=c_size) image = self.normalize(image) image_embeddings_clip = self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) - loss = (spherical_dist_loss(image_embeddings_clip, - original_image_embeddings_clip).mean() * - clip_guidance_scale) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) + loss = spherical_dist_loss(image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale grads = -paddle.autograd.grad(loss, latents)[0] if isinstance(self.scheduler, LMSDiscreteScheduler): latents = latents.detach() + grads * sigma**2 noise_pred = noise_pred_original else: - noise_pred = noise_pred_original - paddle.sqrt( - x=beta_prod_t) * grads + noise_pred = noise_pred_original - paddle.sqrt(x=beta_prod_t) * grads return noise_pred, latents @paddle.no_grad() def __call__( - self, - style_image: Union[paddle.Tensor, PIL.Image.Image], - content_image: Union[paddle.Tensor, PIL.Image.Image], - style_prompt: Optional[str]=None, - content_prompt: Optional[str]=None, - negative_prompt=None, - height: Optional[int]=512, - width: Optional[int]=512, - noise_strength: float=0.6, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - batch_size: Optional[int]=1, - eta: float=0.0, - clip_guidance_scale: Optional[float]=100, - generator: Optional[paddle.Generator]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - 
slerp_latent_style_strength: float=0.8, - slerp_prompt_style_strength: float=0.1, - slerp_clip_image_style_strength: float=0.1, ): + self, + style_image: Union[paddle.Tensor, PIL.Image.Image], + content_image: Union[paddle.Tensor, PIL.Image.Image], + style_prompt: Optional[str] = None, + content_prompt: Optional[str] = None, + negative_prompt=None, + height: Optional[int] = 512, + width: Optional[int] = 512, + noise_strength: float = 0.6, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + batch_size: Optional[int] = 1, + eta: float = 0.0, + clip_guidance_scale: Optional[float] = 100, + generator: Optional[paddle.Generator] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + slerp_latent_style_strength: float = 0.8, + slerp_prompt_style_strength: float = 0.1, + slerp_clip_image_style_strength: float = 0.1, + ): if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed {batch_size} batch_size, but only {len(generator)} generators." - ) + raise ValueError(f"You have passed {batch_size} batch_size, but only {len(generator)} generators.") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # generate prompts with blip model if prompt is if content_prompt is None: @@ -353,35 +341,32 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) - content_text_embeddings = self.text_encoder( - content_text_input.input_ids)[0] + return_tensors="pd", + ) + content_text_embeddings = self.text_encoder(content_text_input.input_ids)[0] style_text_input = self.tokenizer( style_prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) style_text_embeddings = self.text_encoder(style_text_input.input_ids)[0] - text_embeddings = slerp(slerp_prompt_style_strength, - content_text_embeddings, style_text_embeddings) + text_embeddings = slerp(slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings) # duplicate text embeddings for each generation per prompt - text_embeddings = text_embeddings.repeat_interleave( - repeats=batch_size, axis=0) + text_embeddings = text_embeddings.repeat_interleave(repeats=batch_size, axis=0) # set timesteps - accepts_offset = "offset" in set( - inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) # Some schedulers like PNDM have timesteps as arrays # It's more optimized to move all timesteps to correct device beforehand - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - noise_strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, noise_strength) latent_timestep = timesteps[:1].tile(repeat_times=[batch_size]) # Preprocess image @@ -391,25 +376,25 @@ def __call__( latent_timestep, batch_size, text_embeddings.dtype, - generator, ) + generator, + ) preprocessed_style_image = preprocess(style_image, width, height) style_latents = self.prepare_latents( preprocessed_style_image, latent_timestep, batch_size, 
text_embeddings.dtype, - generator, ) - latents = slerp(slerp_latent_style_strength, content_latents, - style_latents) + generator, + ) + latents = slerp(slerp_latent_style_strength, content_latents, style_latents) if clip_guidance_scale > 0: - content_clip_image_embedding = self.get_clip_image_embeddings( - content_image, batch_size) - style_clip_image_embedding = self.get_clip_image_embeddings( - style_image, batch_size) + content_clip_image_embedding = self.get_clip_image_embeddings(content_image, batch_size) + style_clip_image_embedding = self.get_clip_image_embeddings(style_image, batch_size) clip_image_embeddings = slerp( slerp_clip_image_style_strength, content_clip_image_embedding, - style_clip_image_embedding, ) + style_clip_image_embedding, + ) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -426,17 +411,16 @@ def __call__( uncond_tokens, padding="max_length", max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] # duplicate unconditional embeddings for each generation per prompt - uncond_embeddings = uncond_embeddings.repeat_interleave( - repeats=batch_size, axis=0) + uncond_embeddings = uncond_embeddings.repeat_interleave(repeats=batch_size, axis=0) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - x=[uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat(x=[uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -451,13 +435,10 @@ def __call__( ] latents_dtype = text_embeddings.dtype if latents is None: - latents = paddle.randn( - shape=latents_shape, generator=generator, dtype=latents_dtype) + latents = paddle.randn(shape=latents_shape, generator=generator, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -466,41 +447,34 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
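# Minimal illustrative sketch of the introspection pattern used just below, which probes
# scheduler.step() for optional keyword arguments before forwarding them. _DemoScheduler is a
# hypothetical stand-in; only its signature matters here.
import inspect

class _DemoScheduler:
    def step(self, noise_pred, t, latents, eta=0.0, generator=None):
        return latents

step_params = set(inspect.signature(_DemoScheduler.step).parameters.keys())
extra_step_kwargs = {}
if "eta" in step_params:        # DDIM-style schedulers accept eta
    extra_step_kwargs["eta"] = 0.0
if "generator" in step_params:  # some schedulers also accept a generator
    extra_step_kwargs["generator"] = None
# Schedulers whose step() lacks these parameters simply receive an empty kwargs dict.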
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator # with self.progress_bar(total=num_inference_steps): for i, t in tqdm(enumerate(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform classifier free guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform clip guidance if clip_guidance_scale > 0: text_embeddings_for_guidance = ( - text_embeddings.chunk(chunks=2)[1] - if do_classifier_free_guidance else text_embeddings) + text_embeddings.chunk(chunks=2)[1] if do_classifier_free_guidance else text_embeddings + ) noise_pred, latents = self.cond_fn( latents, t, @@ -508,23 +482,21 @@ def __call__( text_embeddings_for_guidance, noise_pred, clip_image_embeddings, - clip_guidance_scale, ) + clip_guidance_scale, + ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: return image, None - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py index ee8e0cac04537..f23f5d60b2eee 100644 --- a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py +++ b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py @@ -20,14 +20,22 @@ from paddle import nn from paddle.nn import functional as F from paddle.vision import 
transforms -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPModel, - CLIPTextModel, CLIPTokenizer) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline, - LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPModel, + CLIPTextModel, + CLIPTokenizer, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from ppdiffusers.utils import logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -46,13 +54,10 @@ def forward(self, pixel_values, num_cutouts): min_size = min(sideX, sideY, self.cut_size) cutouts = [] for _ in range(num_cutouts): - size = int( - paddle.rand((1, ))**self.cut_power * (max_size - min_size) + - min_size) - offsetx = int(paddle.randint(0, sideX - size + 1, (1, ))) - offsety = int(paddle.randint(0, sideY - size + 1, (1, ))) - cutout = pixel_values[:, :, offsety:offsety + size, offsetx:offsetx - + size] + size = int(paddle.rand((1,)) ** self.cut_power * (max_size - min_size) + min_size) + offsetx = int(paddle.randint(0, sideX - size + 1, (1,))) + offsety = int(paddle.randint(0, sideY - size + 1, (1,))) + cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size] cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size)) return paddle.concat(cutouts) @@ -75,15 +80,15 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - clip_model: CLIPModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, - DDIMScheduler], - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + clip_model: CLIPModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler], + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() self.register_modules( vae=vae, @@ -92,20 +97,21 @@ def __init__( tokenizer=tokenizer, unet=unet, scheduler=scheduler, - feature_extractor=feature_extractor, ) - - self.normalize = transforms.Normalize( - mean=feature_extractor.image_mean, std=feature_extractor.image_std) - self.cut_out_size = (feature_extractor.size - if isinstance(feature_extractor.size, int) else - feature_extractor.size["shortest_edge"]) + feature_extractor=feature_extractor, + ) + + self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) + self.cut_out_size = ( + feature_extractor.size + if isinstance(feature_extractor.size, int) + else feature_extractor.size["shortest_edge"] + ) self.make_cutouts = MakeCutouts(self.cut_out_size) set_stop_gradient(self.text_encoder, True) set_stop_gradient(self.clip_model, True) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): if slice_size == "auto": # half the attention head size is usually a good trade-off between # speed and memory @@ -128,16 +134,17 @@ def unfreeze_unet(self): set_stop_gradient(self.unet, False) def cond_fn( - self, - latents, - timestep, - index, - text_embeddings, - noise_pred_original, - 
text_embeddings_clip, - clip_guidance_scale, - num_cutouts, - use_cutouts=True, ): + self, + latents, + timestep, + index, + text_embeddings, + noise_pred_original, + text_embeddings_clip, + clip_guidance_scale, + num_cutouts, + use_cutouts=True, + ): # https://github.com/PaddlePaddle/Paddle/issues/54306 in 2.5rc paddle.set_grad_enabled has bug with paddle.set_grad_enabled(True): latents = latents.detach() @@ -146,24 +153,19 @@ def cond_fn( if isinstance(self.scheduler, LMSDiscreteScheduler): sigma = self.scheduler.sigmas[index] # the model input needs to be scaled to match the continuous ODE formulation in K-LMS - latent_model_input = latents / ((sigma**2 + 1)**0.5) + latent_model_input = latents / ((sigma**2 + 1) ** 0.5) else: latent_model_input = latents # predict the noise residual - noise_pred = self.unet( - latent_model_input, - timestep, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)): alpha_prod_t = self.scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t # compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = ( - latents - beta_prod_t** - (0.5) * noise_pred) / alpha_prod_t**(0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) fac = paddle.sqrt(beta_prod_t) sample = pred_original_sample * (fac) + latents * (1 - fac) @@ -171,8 +173,7 @@ def cond_fn( sigma = self.scheduler.sigmas[index] sample = latents - sigma * noise_pred else: - raise ValueError( - f"scheduler type {type(self.scheduler)} not supported") + raise ValueError(f"scheduler type {type(self.scheduler)} not supported") sample = 1 / 0.18215 * sample image = self.vae.decode(sample).sample @@ -182,23 +183,18 @@ def cond_fn( image = self.make_cutouts(image, num_cutouts) else: resize_transform = transforms.Resize(self.cut_out_size) - image = paddle.stack( - [resize_transform(img) for img in image], axis=0) + image = paddle.stack([resize_transform(img) for img in image], axis=0) image = self.normalize(image).astype(latents.dtype) image_embeddings_clip = self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) if use_cutouts: - dists = spherical_dist_loss(image_embeddings_clip, - text_embeddings_clip) + dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip) dists = dists.reshape([num_cutouts, sample.shape[0], -1]) loss = dists.sum(2).mean(0).sum() * clip_guidance_scale else: - loss = (spherical_dist_loss(image_embeddings_clip, - text_embeddings_clip).mean() * - clip_guidance_scale) + loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale grads = -paddle.autograd.grad(loss, latents)[0] @@ -206,52 +202,49 @@ def cond_fn( latents = latents.detach() + grads * (sigma**2) noise_pred = noise_pred_original else: - noise_pred = noise_pred_original - paddle.sqrt( - beta_prod_t) * grads + noise_pred = noise_pred_original - paddle.sqrt(beta_prod_t) * grads return noise_pred, latents @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - 
guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - clip_guidance_scale: Optional[float]=100, - clip_prompt: Optional[Union[str, List[str]]]=None, - num_cutouts: Optional[int]=4, - use_cutouts: Optional[bool]=True, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + clip_guidance_scale: Optional[float] = 100, + clip_prompt: Optional[Union[str, List[str]]] = None, + num_cutouts: Optional[int] = 4, + use_cutouts: Optional[bool] = True, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # get prompt text embeddings text_inputs = self.tokenizer( @@ -259,26 +252,25 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] # duplicate text embeddings for each generation per prompt bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if clip_guidance_scale > 0: if clip_prompt is not None: @@ -287,19 +279,16 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids else: clip_text_input_ids = text_inputs.input_ids - text_embeddings_clip = self.clip_model.get_text_features( - clip_text_input_ids) - text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) + text_embeddings_clip = self.clip_model.get_text_features(clip_text_input_ids) + text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, axis=-1, keepdim=True) # duplicate text embeddings clip for each generation per prompt bs_embed, _ = text_embeddings_clip.shape - text_embeddings_clip = text_embeddings_clip.tile( - [1, num_images_per_prompt]) - text_embeddings_clip = text_embeddings_clip.reshape( - [bs_embed * num_images_per_prompt, -1]) + text_embeddings_clip = text_embeddings_clip.tile([1, num_images_per_prompt]) + text_embeddings_clip = text_embeddings_clip.reshape([bs_embed * num_images_per_prompt, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -313,14 +302,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -330,23 +321,20 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -360,13 +348,10 @@ def __call__( width // 8, ] if latents is None: - latents = paddle.randn( - latents_shape, generator=generator, dtype=text_embeddings.dtype) + latents = paddle.randn(latents_shape, generator=generator, dtype=text_embeddings.dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -382,41 +367,34 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
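# Minimal illustrative sketch of the classifier-free guidance combine applied in the denoising
# loop that follows: the unconditional and text-conditional noise predictions come out of a single
# doubled batch and are mixed with guidance_scale. Shapes below are hypothetical.
import paddle

guidance_scale = 7.5
noise_pred = paddle.randn([2, 4, 64, 64])  # [uncond, cond] stacked along the batch axis
noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# guidance_scale == 1 recovers the plain conditional prediction; larger values push the sample
# harder toward the text prompt.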
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform classifier free guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform clip guidance if clip_guidance_scale > 0: - text_embeddings_for_guidance = (text_embeddings.chunk(2)[1] - if do_classifier_free_guidance - else text_embeddings) + text_embeddings_for_guidance = ( + text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings + ) noise_pred, latents = self.cond_fn( latents, t, @@ -426,11 +404,11 @@ def __call__( text_embeddings_clip, clip_guidance_scale, num_cutouts, - use_cutouts, ) + use_cutouts, + ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -449,5 +427,4 @@ def __call__( if not return_dict: return (image, None) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=None) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None) diff --git a/ppdiffusers/examples/community/composable_stable_diffusion.py b/ppdiffusers/examples/community/composable_stable_diffusion.py index f3ff012a945f0..74e7f3856fdb6 100644 --- a/ppdiffusers/examples/community/composable_stable_diffusion.py +++ b/ppdiffusers/examples/community/composable_stable_diffusion.py @@ -16,18 +16,16 @@ from typing import Callable, Optional, Union import paddle -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from 
ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -62,30 +60,26 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -107,10 +101,10 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. 
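# Minimal illustrative sketch of how enable_attention_slicing("auto") is resolved in the pipelines
# in this diff: "auto" becomes half the UNet's attention head dimension before set_attention_slice
# is called. The dict stands in for unet.config and its value is hypothetical.
unet_config = {"attention_head_dim": 8}

def resolve_slice_size(slice_size="auto"):
    if slice_size == "auto":
        # half the attention head size is usually a good trade-off between speed and memory
        return unet_config["attention_head_dim"] // 2
    return slice_size

assert resolve_slice_size("auto") == 4
assert resolve_slice_size(2) == 2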
@@ -139,24 +133,25 @@ def disable_attention_slicing(self): @paddle.no_grad() def __call__( - self, - prompt: str, - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: str=None, - # num_images_per_prompt: Optional[int] = 1, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - weights: Optional[str]="", - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - reduce_memory: Optional[bool]=True, - **kwargs, ): + self, + prompt: str, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: str = None, + # num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + weights: Optional[str] = "", + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + reduce_memory: Optional[bool] = True, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -215,24 +210,20 @@ def __call__( if isinstance(prompt, str): batch_size = 1 else: - raise ValueError( - f"`prompt` has to be of type `str`but is {type(prompt)}") + raise ValueError(f"`prompt` has to be of type `str`but is {type(prompt)}") if negative_prompt is not None and not isinstance(negative_prompt, str): - raise ValueError( - f"`negative_prompt` has to be of type `str`but is {type(prompt)}" - ) + raise ValueError(f"`negative_prompt` has to be of type `str`but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if "|" in prompt: prompt = [x.strip() for x in prompt.split("|")] @@ -244,19 +235,19 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] # duplicate text embeddings for each generation per prompt, using mps friendly method # bs_embed, seq_len, _ = text_embeddings.shape @@ -268,20 +259,17 @@ def __call__( # specify weights for prompts (excluding the unconditional score) print("using equal weights for all prompts...") pos_weights = paddle.to_tensor( - [1 / (text_embeddings.shape[0] - 1)] * - (text_embeddings.shape[0] - 1)).reshape([-1, 1, 1, 1]) + [1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1) + ).reshape([-1, 1, 1, 1]) neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1]) - mask = paddle.to_tensor( - [False] + [True] * pos_weights.shape[0], dtype=paddle.bool) + mask = paddle.to_tensor([False] + [True] * pos_weights.shape[0], dtype=paddle.bool) else: # set prompt weight for each num_prompts = len(prompt) if isinstance(prompt, list) else 1 weights = [float(w.strip()) for w in weights.split("|")] if len(weights) < num_prompts: weights.append(1.0) - assert ( - len(weights) == text_embeddings.shape[0] - ), "weights specified are not equal to the number of prompts" + assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts" pos_weights = [] neg_weights = [] mask = [] # first one is unconditional score @@ -296,8 +284,7 @@ def __call__( pos_weights = paddle.to_tensor(pos_weights).reshape([-1, 1, 1, 1]) pos_weights = pos_weights / pos_weights.sum() if neg_weights: - neg_weights = paddle.to_tensor(neg_weights).reshape( - [-1, 1, 1, 1]) + neg_weights = paddle.to_tensor(neg_weights).reshape([-1, 1, 1, 1]) neg_weights = neg_weights / neg_weights.sum() mask = paddle.to_tensor(mask, dtype=paddle.bool) @@ -320,10 +307,10 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method # seq_len = uncond_embeddings.shape[1] @@ -335,31 +322,25 @@ def __call__( # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # update negative weights neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1]) - mask = paddle.to_tensor( - [False] + mask.tolist(), dtype=paddle.bool) + mask = paddle.to_tensor([False] + mask.tolist(), dtype=paddle.bool) # get the initial random noise unless the user supplied it # Unlike in other pipelines, latents need to be generated in the target device # for 1-to-1 results reproducibility with the CompVis implementation. # However this currently doesn't work in `mps`. - latents_shape = [ - batch_size, self.unet.in_channels, height // 8, width // 8 - ] + latents_shape = [batch_size, self.unet.in_channels, height // 8, width // 8] if latents is None: if seed is not None: paddle.seed(seed) latents = paddle.randn(latents_shape, dtype=text_embeddings.dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -375,8 +356,7 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta @@ -384,47 +364,34 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance latent_model_input = ( - paddle.concat([latents] * text_embeddings.shape[0]) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + paddle.concat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if reduce_memory: # reduce memory by predicting each score sequentially noise_preds = [] # predict the noise residual for latent_in, text_embedding_in in zip( - latent_model_input.chunk( - latent_model_input.shape[0], axis=0), - text_embeddings.chunk( - text_embeddings.shape[0], axis=0), ): - noise_preds.append( - self.unet( - latent_in, - t, - encoder_hidden_states=text_embedding_in).sample) + latent_model_input.chunk(latent_model_input.shape[0], axis=0), + text_embeddings.chunk(text_embeddings.shape[0], axis=0), + ): + noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample) noise_preds = paddle.concat(noise_preds, axis=0) else: # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: mask_index = paddle.nonzero(mask).reshape([-1]) non_mask_index = paddle.nonzero(~mask).reshape([-1]) - noise_pred_uncond = (noise_preds[non_mask_index] * - neg_weights).sum(axis=0, keepdim=True) - noise_pred_text = (noise_preds[mask_index] * 
pos_weights).sum( - axis=0, keepdim=True) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond = (noise_preds[non_mask_index] * neg_weights).sum(axis=0, keepdim=True) + noise_pred_text = (noise_preds[mask_index] * pos_weights).sum(axis=0, keepdim=True) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -441,12 +408,11 @@ def __call__( # run safety checker if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.astype( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -456,5 +422,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py index 1a244474fba03..87cee7e93a914 100644 --- a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py +++ b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py @@ -33,15 +33,14 @@ def image_grid(imgs, rows, cols): def create_clip_guided_pipeline( - model_id="CompVis/stable-diffusion-v1-4", - clip_model_id="openai/clip-vit-large-patch14", - scheduler="plms", ): - pipeline = StableDiffusionPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16) + model_id="CompVis/stable-diffusion-v1-4", + clip_model_id="openai/clip-vit-large-patch14", + scheduler="plms", +): + pipeline = StableDiffusionPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) if scheduler == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") else: scheduler = pipeline.scheduler @@ -55,26 +54,28 @@ def create_clip_guided_pipeline( text_encoder=pipeline.text_encoder, scheduler=scheduler, clip_model=clip_model, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) return guided_pipeline def infer( - prompt, - clip_prompt, - num_return_images=1, - num_images_per_prompt=1, - num_inference_steps=50, - clip_guidance_scale=100, - guidance_scale=7.5, - guided_pipeline=None, - negative_prompt="", - use_cutouts=True, - num_cutouts=4, - seed=None, - unfreeze_unet=True, - unfreeze_vae=True, ): + prompt, + clip_prompt, + num_return_images=1, + num_images_per_prompt=1, + num_inference_steps=50, + clip_guidance_scale=100, + guidance_scale=7.5, + guided_pipeline=None, + negative_prompt="", + use_cutouts=True, + num_cutouts=4, + seed=None, + unfreeze_unet=True, + unfreeze_vae=True, +): clip_prompt = clip_prompt if 
clip_prompt.strip() != "" else None if unfreeze_unet: guided_pipeline.unfreeze_unet() @@ -98,7 +99,8 @@ def infer( num_cutouts=num_cutouts, use_cutouts=use_cutouts, seed=seed, - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images images.extend(image) return image_grid(images, 1, len(images)) @@ -141,6 +143,7 @@ def infer( num_cutouts=num_cutouts, seed=seed, unfreeze_unet=unfreeze_unet, - unfreeze_vae=unfreeze_vae, ) + unfreeze_vae=unfreeze_vae, + ) display(grid_image) diff --git a/ppdiffusers/examples/community/interpolate_stable_diffusion.py b/ppdiffusers/examples/community/interpolate_stable_diffusion.py index 82ed3fbc72ad5..d826aad5ac9fb 100644 --- a/ppdiffusers/examples/community/interpolate_stable_diffusion.py +++ b/ppdiffusers/examples/community/interpolate_stable_diffusion.py @@ -20,18 +20,16 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -90,31 +88,27 @@ class StableDiffusionWalkPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -136,10 +130,10 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. When this option is enabled, the attention module will split the input tensor in slices, to compute attention @@ -166,23 +160,24 @@ def disable_attention_slicing(self): @paddle.no_grad() def __call__( - self, - prompt: Optional[Union[str, List[str]]]=None, - height: int=512, - width: int=512, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - text_embeddings: Optional[paddle.Tensor]=None, - **kwargs, ): + self, + prompt: Optional[Union[str, List[str]]] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + text_embeddings: Optional[paddle.Tensor] = None, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -240,16 +235,15 @@ def __call__( """ if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if text_embeddings is None: if isinstance(prompt, str): @@ -257,37 +251,33 @@ def __call__( elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") # get prompt text embeddings text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) print( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, : - self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] else: batch_size = text_embeddings.shape[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -301,14 +291,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -318,23 +310,20 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -349,13 +338,10 @@ def __call__( ] latents_dtype = text_embeddings.dtype if latents is None: - latents = paddle.randn( - latents_shape, generator=generator, dtype=latents_dtype) + latents = paddle.randn(latents_shape, generator=generator, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") latents = latents # set timesteps @@ -372,33 +358,26 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -413,12 +392,11 @@ def __call__( image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.astype( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -428,8 +406,7 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def embed_text(self, text): """takes in text and turns it into text embeddings""" @@ -438,7 +415,8 @@ def embed_text(self, text): padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) with paddle.no_grad(): embed = self.text_encoder(text_input.input_ids)[0] return embed @@ -448,21 +426,23 @@ def get_noise(self, seed, dtype=paddle.float32, height=512, width=512): return paddle.randn( (1, self.unet.in_channels, height // 8, width // 8), generator=paddle.Generator().manual_seed(seed), - dtype=dtype, ) + dtype=dtype, + ) def walk( - self, - prompts: List[str], - seeds: List[int], - num_interpolation_steps: Optional[int]=6, - output_dir: Optional[str]="./dreams", - name: Optional[str]=None, - batch_size: Optional[int]=1, - height: Optional[int]=512, - width: Optional[int]=512, - guidance_scale: Optional[float]=7.5, - num_inference_steps: Optional[int]=50, - eta: Optional[float]=0.0, ) -> List[str]: + self, + prompts: List[str], + seeds: List[int], 
+ num_interpolation_steps: Optional[int] = 6, + output_dir: Optional[str] = "./dreams", + name: Optional[str] = None, + batch_size: Optional[int] = 1, + height: Optional[int] = 512, + width: Optional[int] = 512, + guidance_scale: Optional[float] = 7.5, + num_inference_steps: Optional[int] = 50, + eta: Optional[float] = 0.0, + ) -> List[str]: """ Walks through a series of prompts and seeds, interpolating between them and saving the results to disk. Args: @@ -509,8 +489,7 @@ def walk( frame_idx = 0 frame_filepaths = [] - for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], - seeds, seeds[1:]): + for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]): # Embed Text embed_a = self.embed_text(prompt_a) embed_b = self.embed_text(prompt_b) @@ -526,14 +505,10 @@ def walk( noise = slerp(float(t), noise_a, noise_b) embed = paddle.lerp(embed_a, embed_b, t) - noise_batch = (noise if noise_batch is None else paddle.concat( - [noise_batch, noise], axis=0)) - embeds_batch = (embed - if embeds_batch is None else paddle.concat( - [embeds_batch, embed], axis=0)) + noise_batch = noise if noise_batch is None else paddle.concat([noise_batch, noise], axis=0) + embeds_batch = embed if embeds_batch is None else paddle.concat([embeds_batch, embed], axis=0) - batch_is_ready = (embeds_batch.shape[0] == batch_size or - i + 1 == T.shape[0]) + batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0] if batch_is_ready: outputs = self( latents=noise_batch, @@ -542,12 +517,12 @@ def walk( width=width, guidance_scale=guidance_scale, eta=eta, - num_inference_steps=num_inference_steps, ) + num_inference_steps=num_inference_steps, + ) noise_batch, embeds_batch = None, None for image in outputs["images"]: - frame_filepath = str(save_path / - f"frame_{frame_idx:06d}.png") + frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png") image.save(frame_filepath) frame_filepaths.append(frame_filepath) frame_idx += 1 diff --git a/ppdiffusers/examples/community/lpw_stable_diffusion.py b/ppdiffusers/examples/community/lpw_stable_diffusion.py index 6870f3e68508a..c52d942b0b5a4 100644 --- a/ppdiffusers/examples/community/lpw_stable_diffusion.py +++ b/ppdiffusers/examples/community/lpw_stable_diffusion.py @@ -19,17 +19,18 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from paddlemix.utils.tools import compare_version from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, StableDiffusionPipelineOutput) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) + StableDiffusionPipeline, + StableDiffusionPipelineOutput, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import logging if compare_version(PIL.__version__, "9.1.0") >= 0: @@ -55,7 +56,8 @@ [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -144,9 +146,7 @@ def multiply_range(start_position, multiplier): return res -def get_prompts_with_weights(pipe: StableDiffusionPipeline, - prompt: List[str], - max_length: int): +def 
get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int): r""" Tokenize a list of prompts and return its tokens with weights of each token. No padding, starting or ending token is included. @@ -176,32 +176,20 @@ def get_prompts_with_weights(pipe: StableDiffusionPipeline, tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. """ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -209,8 +197,7 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] @@ -219,10 +206,11 @@ def pad_tokens_and_weights(tokens, def get_unweighted_text_embeddings( - pipe: StableDiffusionPipeline, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, ): + pipe: StableDiffusionPipeline, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. 
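The chunked encoding that the following hunk reformats can be illustrated with a small, self-contained sketch. This is an illustration only, not the pipeline's code: `encode_chunk` is a hypothetical stand-in for a call to the CLIP text encoder, and the real implementation works on padded paddle tensors and also honours the `no_boseos_middle` option.

```python
# Sketch: encode a prompt longer than the encoder window by slicing it into
# (chunk_length - 2)-token pieces and re-attaching BOS/EOS to every piece.
from typing import Callable, List


def encode_long_prompt(
    token_ids: List[int],  # prompt tokens without BOS/EOS
    bos: int,
    eos: int,
    chunk_length: int,  # 77 for CLIP
    encode_chunk: Callable[[List[int]], List[List[float]]],  # hypothetical encoder call
) -> List[List[float]]:
    body = chunk_length - 2
    hidden_states: List[List[float]] = []
    for start in range(0, len(token_ids), body):
        piece = [bos] + token_ids[start : start + body] + [eos]
        hidden_states.extend(encode_chunk(piece))  # one encoder pass per chunk
    return hidden_states


# toy encoder: a 1-dim "embedding" per token, just to show the bookkeeping
toy = lambda ids: [[float(t)] for t in ids]
assert len(encode_long_prompt(list(range(150)), 49406, 49407, 77, toy)) == 154  # 2 chunks x 77 positions
```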
@@ -232,8 +220,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -259,14 +246,15 @@ def get_unweighted_text_embeddings( def get_weighted_text_embeddings( - pipe: StableDiffusionPipeline, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + pipe: StableDiffusionPipeline, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -290,24 +278,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. """ - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -315,33 +298,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - 
) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( prompt_tokens, @@ -351,7 +327,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( @@ -362,7 +339,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) # get the embeddings @@ -370,32 +348,28 @@ def get_weighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - prompt_weights = paddle.to_tensor( - prompt_weights, dtype=text_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - uncond_weights = paddle.to_tensor( - uncond_weights, dtype=uncond_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? 
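The weighting step in the next hunk (scale each token embedding by its parsed weight, then restore the embedding's original mean) can be sketched in a few lines of numpy. This only illustrates the arithmetic with assumed shapes; it is not the pipeline's paddle code.

```python
# Sketch of the prompt-weighting arithmetic: weight tokens, then renormalize so the
# overall magnitude of the embedding is unchanged.
import numpy as np


def apply_prompt_weights(embeddings: np.ndarray, weights: np.ndarray) -> np.ndarray:
    """embeddings: (batch, seq_len, dim); weights: (batch, seq_len)."""
    previous_mean = embeddings.mean(axis=(-2, -1), keepdims=True)
    weighted = embeddings * weights[..., None]
    # rescale so that emphasising a few tokens does not change the overall mean
    weighted *= previous_mean / weighted.mean(axis=(-2, -1), keepdims=True)
    return weighted


emb = np.random.rand(1, 77, 768).astype("float32")
w = np.ones((1, 77), dtype="float32")
w[:, 5:8] = 1.1  # tokens wrapped in "(...)" would get a weight > 1
assert np.isclose(apply_prompt_weights(emb, w).mean(), emb.mean(), rtol=1e-3)
```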
if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= ( - (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1) - .unsqueeze(-1)) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= ( - (previous_mean / uncond_embeddings.mean(axis=[-2, -1])) - .unsqueeze(-1).unsqueeze(-1)) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: return text_embeddings, uncond_embeddings @@ -416,8 +390,7 @@ def preprocess_mask(mask, scale_factor=8): mask = mask.convert("L") w, h = mask.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize( - (w // scale_factor, h // scale_factor), resample=Resampling.NEAREST) + mask = mask.resize((w // scale_factor, h // scale_factor), resample=Resampling.NEAREST) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? @@ -454,16 +427,16 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: Optional[bool]=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: Optional[bool] = True, + ): super().__init__( vae=vae, text_encoder=text_encoder, @@ -472,7 +445,8 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) self.__init__additional__() def __init__additional__(self): @@ -480,10 +454,10 @@ def __init__additional__(self): setattr( self, "vae_scale_factor", - 2**(len(self.vae.config.block_out_channels) - 1), ) + 2 ** (len(self.vae.config.block_out_channels) - 1), + ) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. 
When this option is enabled, the attention module will split the input tensor in slices, to compute attention @@ -510,34 +484,31 @@ def disable_attention_slicing(self): def check_inputs(self, prompt, height, width, strength, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -564,28 +535,25 @@ def _encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`."
+ ) text_embeddings, uncond_embeddings = get_weighted_text_embeddings( pipe=self, prompt=prompt, - uncond_prompt=negative_prompt - if do_classifier_free_guidance else None, + uncond_prompt=negative_prompt if do_classifier_free_guidance else None, max_embeddings_multiples=max_embeddings_multiples, - **kwargs, ) + **kwargs, + ) bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [1, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) return text_embeddings @@ -602,29 +570,20 @@ def get_timesteps(self, num_inference_steps, strength, is_text2img): timesteps = self.scheduler.timesteps[t_start:] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - height, - width, - dtype, - generator, - latents=None): + def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None): if image is None: shape = ( batch_size, self.unet.in_channels, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if latents is None: latents = paddle.randn(shape, generator=generator, dtype=dtype) else: if latents.shape != shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -644,27 +603,28 @@ def prepare_latents(self, @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[int]=7.5, - strength: Optional[int]=0.8, - num_images_per_prompt: Optional[int]=1, - eta: Optional[int]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[int] = 7.5, + strength: Optional[int] = 0.8, + 
num_images_per_prompt: Optional[int] = 1, + eta: Optional[int] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -753,7 +713,8 @@ def __call__( num_images_per_prompt, do_classifier_free_guidance, negative_prompt, - max_embeddings_multiples, ) + max_embeddings_multiples, + ) dtype = text_embeddings.dtype # 4. Preprocess image and mask @@ -765,17 +726,14 @@ def __call__( mask_image = preprocess_mask(mask_image, self.vae_scale_factor) if mask_image is not None: mask_image = mask_image.astype(dtype=dtype) - mask = paddle.concat([mask_image] * batch_size * - num_images_per_prompt) + mask = paddle.concat([mask_image] * batch_size * num_images_per_prompt) else: mask = None # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps( - num_inference_steps, strength, image is None) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables latents, init_latents_orig, noise = self.prepare_latents( @@ -786,7 +744,8 @@ def __call__( width, dtype, generator, - latents, ) + latents, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -794,46 +753,37 @@ def __call__( # 8. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if mask is not None: # masking - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise, t) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) latents = (init_latents_proper * mask) + (latents * (1 - mask)) # call the callback, if provided if i % callback_steps == 0: if callback is not None: callback(i, t, latents) - if is_cancelled_callback is not None and is_cancelled_callback( - ): + if is_cancelled_callback is not None and is_cancelled_callback(): return None # 9. Post-processing image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 11. 
Convert to PIL if output_type == "pil": @@ -842,28 +792,28 @@ def __call__( if not return_dict: return image, has_nsfw_concept - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def text2img( - self, - prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]]=None, - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - num_images_per_prompt: Optional[int]=1, - eta: Optional[int]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[int] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function for text-to-image generation. Args: @@ -936,26 +886,28 @@ def text2img( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - negative_prompt: Optional[Union[str, List[str]]]=None, - strength: Optional[float]=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + negative_prompt: Optional[Union[str, List[str]]] = None, + strength: Optional[float] = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function for image-to-image generation. 
Args: @@ -1029,27 +981,29 @@ def img2img( callback=callback, is_cancelled_callback=is_cancelled_callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - negative_prompt: Optional[Union[str, List[str]]]=None, - strength: Optional[float]=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + negative_prompt: Optional[Union[str, List[str]]] = None, + strength: Optional[float] = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function for inpaint. Args: @@ -1124,4 +1078,5 @@ def inpaint( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) diff --git a/ppdiffusers/examples/community/mixture_tiling.py b/ppdiffusers/examples/community/mixture_tiling.py index 62f8650648596..5ae0911810d10 100644 --- a/ppdiffusers/examples/community/mixture_tiling.py +++ b/ppdiffusers/examples/community/mixture_tiling.py @@ -23,17 +23,18 @@ from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import logging try: from ligo.segments import segment - from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) + from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPTextModel, + CLIPTokenizer, + ) except ImportError: - raise ImportError( - "Please install paddlenlp and ligo-segments to use the mixture pipeline") + raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline") logger = logging.get_logger(__name__) EXAMPLE_DOC_STRING = """ Examples: @@ -61,8 +62,7 @@ """ -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image Returns a tuple with: @@ -71,11 +71,9 @@ def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - Starting coordinates of columns in pixel 
space - Ending coordinates of columns in pixel space """ - px_row_init = 0 if tile_row == 0 else tile_row * ( - tile_height - tile_row_overlap) + px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * ( - tile_width - tile_col_overlap) + px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) px_col_end = px_col_init + tile_width return px_row_init, px_row_end, px_col_init, px_col_end @@ -85,8 +83,7 @@ def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image Returns a tuple with: @@ -96,15 +93,14 @@ def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - Ending coordinates of columns in latent space """ px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, - px_col_end) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) -def _tile2latent_exclusive_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap, rows, - columns): +def _tile2latent_exclusive_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns +): """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image Returns a tuple with: @@ -114,18 +110,17 @@ def _tile2latent_exclusive_indices(tile_row, tile_col, tile_width, tile_height, - Ending coordinates of columns in latent space """ row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) row_segment = segment(row_init, row_end) col_segment = segment(col_init, col_end) # Iterate over the rest of tiles, clipping the region for the current tile for row in range(rows): for column in range(columns): if row != tile_row and column != tile_col: - (clip_row_init, clip_row_end, clip_col_init, - clip_col_end) = _tile2latent_indices( - row, column, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) + (clip_row_init, clip_row_end, clip_col_init, clip_col_end) = _tile2latent_indices( + row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) row_segment = row_segment - segment(clip_row_init, clip_row_end) col_segment = col_segment - segment(clip_col_init, clip_col_end) # return row_init, row_end, col_init, col_end @@ -151,17 +146,17 @@ def decode_latents(self, latents, cpu_vae=False): return self.numpy_to_pil(image) -class StableDiffusionTilingPipeline(DiffusionPipeline, - StableDiffusionExtrasMixin): +class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin): def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: 
UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() self.register_modules( vae=vae, @@ -170,7 +165,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) class SeedTilesMode(Enum): """Modes in which the latents of a particular tile can be re-seeded""" @@ -180,22 +176,22 @@ class SeedTilesMode(Enum): @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - tile_height: Optional[int]=512, - tile_width: Optional[int]=512, - tile_row_overlap: Optional[int]=256, - tile_col_overlap: Optional[int]=256, - guidance_scale_tiles: Optional[List[List[float]]]=None, - seed_tiles: Optional[List[List[int]]]=None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]]="full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, - int]]]=None, - cpu_vae: Optional[bool]=False, ): + self, + prompt: Union[str, List[List[str]]], + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + tile_height: Optional[int] = 512, + tile_width: Optional[int] = 512, + tile_row_overlap: Optional[int] = 256, + tile_col_overlap: Optional[int] = 256, + guidance_scale_tiles: Optional[List[List[float]]] = None, + seed_tiles: Optional[List[List[int]]] = None, + seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", + seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, + cpu_vae: Optional[bool] = False, + ): """ Function to run the diffusion pipeline with tiling support. @@ -221,24 +217,18 @@ def __call__( A PIL image with the generated image. 
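As a plain-Python illustration of the tile arithmetic used by the `_tile2pixel_indices` and `_pixel2latent_indices` helpers above: a tile's pixel window starts at `tile_index * (tile_size - overlap)` and its latent coordinates are the pixel coordinates divided by 8. The defaults below are the 512-pixel tiles and 256-pixel overlaps used by this pipeline; this is a worked example, not the pipeline's code.

```python
# Worked example of mapping a tile position to its latent-space window.
def tile_to_latent(tile_row, tile_col, tile_width=512, tile_height=512,
                   row_overlap=256, col_overlap=256):
    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - row_overlap)
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - col_overlap)
    px_row_end = px_row_init + tile_height
    px_col_end = px_col_init + tile_width
    # latent space is 8x smaller than pixel space for the SD VAE
    return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8


# tile (1, 2) covers latent rows 32..96 and latent columns 64..128
assert tile_to_latent(1, 2) == (32, 96, 64, 128)
```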
""" - if not isinstance(prompt, list) or not all( - isinstance(row, list) for row in prompt): - raise ValueError( - f"`prompt` has to be a list of lists but is {type(prompt)}") + if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): + raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") grid_rows = len(prompt) grid_cols = len(prompt[0]) if not all(len(row) == grid_cols for row in prompt): - raise ValueError( - "All prompt rows must have the same number of prompt columns") + raise ValueError("All prompt rows must have the same number of prompt columns") if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or - not all(isinstance(row, list) for row in seed_tiles_mode)): - raise ValueError( - f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}" - ) + not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) + ): + raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] - for row in prompt] + seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] modes = [mode.value for mode in self.SeedTilesMode] if any(mode not in modes for row in seed_tiles_mode for mode in row): raise ValueError(f"Seed tiles mode must be one of {modes}") @@ -247,11 +237,9 @@ def __call__( batch_size = 1 # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap - ) + height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) - latents_shape = (batch_size, self.unet.config.in_channels, height // 8, - width // 8) + latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) generator = paddle.Generator().manual_seed(seed) latents = paddle.randn(shape=latents_shape, generator=generator) @@ -263,8 +251,8 @@ def __call__( mode = seed_tiles_mode[row][col] if mode == self.SeedTilesMode.FULL.value: row_init, row_end, col_init, col_end = _tile2latent_indices( - row, col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap) + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) else: row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( row, @@ -274,29 +262,27 @@ def __call__( tile_row_overlap, tile_col_overlap, grid_rows, - grid_cols, ) - tile_generator = paddle.Generator().manual_seed( - seed_tile) - tile_shape = latents_shape[0], latents_shape[ - 1], row_end - row_init, col_end - col_init - latents[:, :, row_init:row_end, col_init: - col_end] = paddle.randn( - shape=tile_shape, generator=tile_generator) + grid_cols, + ) + tile_generator = paddle.Generator().manual_seed(seed_tile) + tile_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init + latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( + shape=tile_shape, generator=tile_generator + ) # overwrite again for seed reroll regions for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, - col_end) # to latent space coordinates + row_init, row_end, col_init, col_end + ) # to latent space coordinates reroll_generator = paddle.Generator().manual_seed(seed_reroll) - region_shape = 
latents_shape[0], latents_shape[ - 1], row_end - row_init, col_end - col_init + region_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=region_shape, generator=reroll_generator) + shape=region_shape, generator=reroll_generator + ) # Prepare scheduler - accepts_offset = "offset" in set( - inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 @@ -306,17 +292,20 @@ def __call__( latents = latents * self.scheduler.sigmas[0] # get prompts text embeddings - text_input = [[ - self.tokenizer( - col, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", ) for col in row - ] for row in prompt] - text_embeddings = [[ - self.text_encoder(col.input_ids)[0] for col in row - ] for row in text_input] + text_input = [ + [ + self.tokenizer( + col, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + for col in row + ] + for row in prompt + ] + text_embeddings = [[self.text_encoder(col.input_ids)[0] for col in row] for row in text_input] # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -328,32 +317,26 @@ def __call__( for j in range(grid_cols): max_length = text_input[i][j].input_ids.shape[-1] uncond_input = self.tokenizer( - [""] * batch_size, - padding="max_length", - max_length=max_length, - return_tensors="pd") - uncond_embeddings = self.text_encoder( - uncond_input.input_ids)[0] + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pd" + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings[i][j] = paddle.concat( - x=[uncond_embeddings, text_embeddings[i][j]]) + text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]]) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, - batch_size) + tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) # Diffusion timesteps for i, t in tqdm(enumerate(self.scheduler.timesteps)): @@ -363,33 +346,28 @@ def __call__( noise_preds_row = [] for col in range(grid_cols): px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - tile_latents = latents[:, :, px_row_init:px_row_end, - px_col_init:px_col_end] + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[tile_latents] * 2) - if do_classifier_free_guidance else - tile_latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings[row][col])[ - "sample"] + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[ + "sample" + ] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - guidance = (guidance_scale - if guidance_scale_tiles is None or - guidance_scale_tiles[row][col] is None else - guidance_scale_tiles[row][col]) - noise_pred_tile = noise_pred_uncond + guidance * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + guidance = ( + guidance_scale + if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None + else guidance_scale_tiles[row][col] + ) + noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) noise_preds_row.append(noise_pred_tile) noise_preds.append(noise_preds_row) # Stitch noise predictions for all tiles @@ -399,13 +377,12 @@ def __call__( for row in range(grid_rows): for col in range(grid_cols): px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - noise_pred[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += (noise_preds[row][col] * - tile_weights) - contributors[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += tile_weights + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( + noise_preds[row][col] * tile_weights + ) + contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights # Average overlapping areas with more than 1 contributor noise_pred /= contributors # compute the previous noisy sample x_t -> x_t-1 @@ -424,14 +401,16 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches): latent_height = tile_height // 8 var = 
0.01 midpoint = (latent_width - 1) / 2 - x_probs = [(exp(-(x - midpoint) * (x - midpoint) / - (latent_width * latent_width) / (2 * var)) / - sqrt(2 * pi * var)) for x in range(latent_width)] + x_probs = [ + (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)) + for x in range(latent_width) + ] midpoint = latent_height / 2 - y_probs = [(exp(-(y - midpoint) * (y - midpoint) / - (latent_height * latent_height) / (2 * var)) / - sqrt(2 * pi * var)) for y in range(latent_height)] + y_probs = [ + (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)) + for y in range(latent_height) + ] weights = np.outer(y_probs, x_probs) return paddle.tile( - x=paddle.to_tensor(data=weights), - repeat_times=(nbatches, self.unet.config.in_channels, 1, 1)) + x=paddle.to_tensor(data=weights), repeat_times=(nbatches, self.unet.config.in_channels, 1, 1) + ) diff --git a/ppdiffusers/examples/community/one_step_unet.py b/ppdiffusers/examples/community/one_step_unet.py index 489cef26e01d8..5baffefdab061 100644 --- a/ppdiffusers/examples/community/one_step_unet.py +++ b/ppdiffusers/examples/community/one_step_unet.py @@ -24,15 +24,14 @@ def __init__(self, unet, scheduler): self.register_modules(unet=unet, scheduler=scheduler) def __call__(self): - image = paddle.randn((1, self.unet.in_channels, self.unet.sample_size, - self.unet.sample_size), ) + image = paddle.randn( + (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), + ) timestep = 1 model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, - image).prev_sample + scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - result = (scheduler_output - scheduler_output + - paddle.ones_like(scheduler_output)) + result = scheduler_output - scheduler_output + paddle.ones_like(scheduler_output) return result diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py index 218ef8d7ab49c..b32b422bd47ae 100644 --- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py +++ b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py @@ -23,17 +23,17 @@ from ppdiffusers import DiffusionPipeline from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel) -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, +) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from ppdiffusers.schedulers import KarrasDiffusionSchedulers from ppdiffusers.utils import logging, randn_tensor logger = logging.get_logger(__name__) -class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, - FastDeployDiffusionPipelineMixin): +class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-to-image generation with high resolution fixing(hires.fix) based on Stable Diffusion. @@ -63,21 +63,20 @@ class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" - _optional_components = [ - "vae_encoder", "safety_checker", "feature_extractor" - ] + _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -102,7 +101,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() @@ -111,7 +111,7 @@ def get_timesteps(self, denoising_steps, denoising_strength): self.scheduler.set_timesteps(steps) t_start = max(steps - denoising_steps, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] if hasattr(self.scheduler, "step_index_offset"): self.scheduler.step_index_offset = t_start * self.scheduler.order @@ -119,48 +119,45 @@ def get_timesteps(self, denoising_steps, denoising_strength): return timesteps.cast("float32"), denoising_steps def check_inputs( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + hr_scale, + hr_resize_height, + hr_resize_width, + denoising_strength, + latent_scale_mode, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if hr_scale < 0: - raise ValueError( - "hr_scale shoule be greater that 0, but acceived {hr_scale}") + raise ValueError("hr_scale shoule be greater that 0, but acceived {hr_scale}") if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: raise ValueError( @@ -168,9 +165,7 @@ def check_inputs( ) if denoising_strength > 1 or denoising_strength < 0: - raise ValueError( - f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}" - ) + raise ValueError(f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -188,14 +183,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") - - def get_upscaled_width_and_height(self, - width, - height, - hr_scale=2, - hr_resize_width=0, - hr_resize_height=0): + f" {negative_prompt_embeds.shape}." + ) + + def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): if hr_resize_width == 0 and hr_resize_height == 0: hr_upscale_to_width = int(width * hr_scale) hr_upscale_to_height = int(height * hr_scale) @@ -221,36 +212,36 @@ def get_upscaled_width_and_height(self, @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=40, - hires_ratio: Optional[float]=0.5, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - enable_hr: Optional[bool]=True, - hr_scale: Optional[float]=2.0, - hr_resize_width: Optional[int]=0, - hr_resize_height: Optional[int]=0, - denoising_strength: Optional[float]=0.7, - latent_scale_mode: Optional[str]="nearest", - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 40, + hires_ratio: Optional[float] = 0.5, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: 
Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + enable_hr: Optional[bool] = True, + hr_scale: Optional[float] = 2.0, + hr_resize_width: Optional[int] = 0, + hr_resize_height: Optional[int] = 0, + denoising_strength: Optional[float] = 0.7, + latent_scale_mode: Optional[str] = "nearest", + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -347,7 +338,8 @@ def __call__( latent_scale_mode, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -373,7 +365,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -385,7 +378,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Prepare timesteps if enable_hr: @@ -401,18 +395,17 @@ def __call__( # 5. Prepare latent variables if generator is None: generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - generator_state) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - paddle.Generator().states_[generator]) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) latents = self.prepare_latents( batch_size * num_images_per_prompt, height, width, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
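# `prepare_extra_step_kwargs` only forwards `eta` and `generator` when the
# scheduler's `step()` signature actually accepts them (eta is only meaningful
# for DDIM-style schedulers). A minimal, self-contained sketch of that check —
# `build_extra_step_kwargs` is a hypothetical helper, not part of this pipeline:
import inspect


def build_extra_step_kwargs(scheduler, generator, eta):
    step_params = set(inspect.signature(scheduler.step).parameters.keys())
    extra_step_kwargs = {}
    if "eta" in step_params:  # eta corresponds to η in the DDIM paper
        extra_step_kwargs["eta"] = eta
    if "generator" in step_params:
        extra_step_kwargs["generator"] = generator
    return extra_step_kwargs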
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -423,34 +416,29 @@ def __call__( with self.progress_bar(total=sample_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -462,15 +450,13 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -483,19 +469,16 @@ def __call__( # 8. determine the upscaled width and height for upscaled images truncate_width = 0 truncate_height = 0 - ( - hr_upscale_to_width, - hr_upscale_to_height, ) = self.get_upscaled_width_and_height( - width, - height, - hr_scale=hr_scale, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, ) + (hr_upscale_to_width, hr_upscale_to_height,) = self.get_upscaled_width_and_height( + width, + height, + hr_scale=hr_scale, + hr_resize_width=hr_resize_width, + hr_resize_height=hr_resize_height, + ) if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (hr_upscale_to_width - hr_resize_width - ) // self.vae_scale_factor - truncate_height = (hr_upscale_to_height - hr_resize_height - ) // self.vae_scale_factor + truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor + truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor # 9. 
special case: do nothing if upscaling is not nesscessary if hr_upscale_to_width == width and hr_upscale_to_height == height: @@ -504,77 +487,69 @@ def __call__( if enable_hr: if do_controlnet: - ( - control_image, - control_conditioning_scale, - ) = self.prepare_controlnet_cond( + (control_image, control_conditioning_scale,) = self.prepare_controlnet_cond( controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, width=hr_upscale_to_width, height=hr_upscale_to_height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 10. prepare init latents - timesteps, hr_steps = self.get_timesteps(hr_steps, - denoising_strength) + timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength) init_timestep = timesteps[:1].tile([latents.shape[0]]) latents = F.interpolate( latents, size=( hr_upscale_to_height // self.vae_scale_factor, - hr_upscale_to_width // self.vae_scale_factor, ), - mode=latent_scale_mode, ) - latents = latents[:, :, truncate_height // 2:latents.shape[2] - ( - truncate_height + 1) // 2, truncate_width // 2:latents.shape[3] - - (truncate_width + 1) // 2, ] - - noise = randn_tensor( - latents.shape, - dtype=latents.dtype, - generator="initial_generator") + hr_upscale_to_width // self.vae_scale_factor, + ), + mode=latent_scale_mode, + ) + latents = latents[ + :, + :, + truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, + truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, + ] + + noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") latents = self.scheduler.add_noise(latents, noise, init_timestep) # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs( - "initial_generator", eta) + extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) # 12. 
denoising on hires.fix steps num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order with self.progress_bar(total=hr_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else - latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -586,16 +561,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -606,7 +579,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -617,11 +591,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py index 
bf9bbf48e6e90..2fb5aa69a20ee 100644 --- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py +++ b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py @@ -24,10 +24,12 @@ # from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel) + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, +) + # from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import logging try: @@ -35,13 +37,11 @@ from paddlenlp.transformers import CLIPFeatureExtractor # CLIPTextModel, from paddlenlp.transformers import CLIPTokenizer except ImportError: - raise ImportError( - "Please install paddlenlp and ligo-segments to use the mixture pipeline") + raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline") logger = logging.get_logger(__name__) -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image Returns a tuple with: @@ -50,11 +50,9 @@ def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - Starting coordinates of columns in pixel space - Ending coordinates of columns in pixel space """ - px_row_init = 0 if tile_row == 0 else tile_row * ( - tile_height - tile_row_overlap) + px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * ( - tile_width - tile_col_overlap) + px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) px_col_end = px_col_init + tile_width return px_row_init, px_row_end, px_col_init, px_col_end @@ -64,8 +62,7 @@ def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image Returns a tuple with: @@ -75,21 +72,21 @@ def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - Ending coordinates of columns in latent space """ px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, - px_col_end) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) def _tile2latent_exclusive_indices( - tile_row, - tile_col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - rows, - columns, ): + tile_row, + tile_col, + tile_width, + tile_height, + tile_row_overlap, + 
tile_col_overlap, + rows, + columns, +): """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image Returns a tuple with: @@ -99,25 +96,22 @@ def _tile2latent_exclusive_indices( - Ending coordinates of columns in latent space """ row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) row_segment = segment(row_init, row_end) col_segment = segment(col_init, col_end) # Iterate over the rest of tiles, clipping the region for the current tile for row in range(rows): for column in range(columns): if row != tile_row and column != tile_col: - ( - clip_row_init, - clip_row_end, - clip_col_init, - clip_col_end, ) = _tile2latent_indices( - row, - column, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, ) + (clip_row_init, clip_row_end, clip_col_init, clip_col_end,) = _tile2latent_indices( + row, + column, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + ) row_segment = row_segment - segment(clip_row_init, clip_row_end) col_segment = col_segment - segment(clip_col_init, clip_col_end) # return row_init, row_end, col_init, col_end @@ -127,10 +121,7 @@ def _tile2latent_exclusive_indices( class StableDiffusionExtrasMixin: """Mixin providing additional convenience method to Stable Diffusion pipelines""" - def _decode_vae_latents(self, - latents: paddle.Tensor, - infer_op=None, - **kwargs): + def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): latents_shape = latents.shape output_shape = [ latents_shape[0], @@ -143,7 +134,8 @@ def _decode_vae_latents(self, images_vae = self.vae_decoder( latent_sample=latents, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return images_vae @@ -163,19 +155,20 @@ def decode_latents(self, latents, cpu_vae=False): return self.numpy_to_pil(image) -class FastDeployStableDiffusionTilingPipeline(DiffusionPipeline, - StableDiffusionExtrasMixin, - FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionTilingPipeline( + DiffusionPipeline, StableDiffusionExtrasMixin, FastDeployDiffusionPipelineMixin +): def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler], + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() self.register_modules( vae_encoder=vae_encoder, @@ -185,7 +178,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.post_init() class SeedTilesMode(Enum): @@ -196,24 +190,24 @@ class SeedTilesMode(Enum): @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - tile_height: Optional[int]=512, - 
tile_width: Optional[int]=512, - tile_row_overlap: Optional[int]=256, - tile_col_overlap: Optional[int]=256, - guidance_scale_tiles: Optional[List[List[float]]]=None, - seed_tiles: Optional[List[List[int]]]=None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]]="full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, - int]]]=None, - # parse_prompt_type: Optional[str] = "lpw", - # max_embeddings_multiples: Optional[int] = 3, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[List[str]]], + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + tile_height: Optional[int] = 512, + tile_width: Optional[int] = 512, + tile_row_overlap: Optional[int] = 256, + tile_col_overlap: Optional[int] = 256, + guidance_scale_tiles: Optional[List[List[float]]] = None, + seed_tiles: Optional[List[List[int]]] = None, + seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", + seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, + # parse_prompt_type: Optional[str] = "lpw", + # max_embeddings_multiples: Optional[int] = 3, + infer_op_dict: Dict[str, str] = None, + ): """ Function to run the diffusion pipeline with tiling support. @@ -244,24 +238,18 @@ def __call__( """ infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - if not isinstance(prompt, list) or not all( - isinstance(row, list) for row in prompt): - raise ValueError( - f"`prompt` has to be a list of lists but is {type(prompt)}") + if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): + raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") grid_rows = len(prompt) grid_cols = len(prompt[0]) if not all(len(row) == grid_cols for row in prompt): - raise ValueError( - "All prompt rows must have the same number of prompt columns") + raise ValueError("All prompt rows must have the same number of prompt columns") if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or - not all(isinstance(row, list) for row in seed_tiles_mode)): - raise ValueError( - f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}" - ) + not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) + ): + raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] - for row in prompt] + seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] modes = [mode.value for mode in self.SeedTilesMode] if any(mode not in modes for row in seed_tiles_mode for mode in row): raise ValueError(f"Seed tiles mode must be one of {modes}") @@ -270,14 +258,14 @@ def __call__( batch_size = 1 # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap - ) + height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) latents_shape = ( batch_size, self.vae_decoder_num_latent_channels, height // 8, - width // 8, ) + width // 8, + ) generator = paddle.Generator().manual_seed(seed) latents = paddle.randn(shape=latents_shape, generator=generator) @@ -295,49 +283,48 @@ def __call__( tile_width, tile_height, tile_row_overlap, - tile_col_overlap, ) + tile_col_overlap, + ) else: - ( - row_init, 
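# Worked example of the tile -> latent index mapping used here: with 512x512
# tiles and 256px overlaps, adjacent tiles share half of their area, and pixel
# coordinates are divided by 8 to land in latent space. `tile_to_latent` is a
# hypothetical standalone helper mirroring _tile2pixel_indices/_pixel2latent_indices:
def tile_to_latent(tile_row, tile_col, tile_width=512, tile_height=512, row_overlap=256, col_overlap=256):
    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - row_overlap)
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - col_overlap)
    px_row_end, px_col_end = px_row_init + tile_height, px_col_init + tile_width
    return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8


# tile (0, 1) covers pixels [0:512, 256:768], i.e. latents [0:64, 32:96]
assert tile_to_latent(0, 1) == (0, 64, 32, 96)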
- row_end, - col_init, - col_end, ) = _tile2latent_exclusive_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - grid_rows, - grid_cols, ) - tile_generator = paddle.Generator().manual_seed( - seed_tile) + (row_init, row_end, col_init, col_end,) = _tile2latent_exclusive_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + grid_rows, + grid_cols, + ) + tile_generator = paddle.Generator().manual_seed(seed_tile) tile_shape = ( latents_shape[0], latents_shape[1], row_end - row_init, - col_end - col_init, ) - latents[:, :, row_init:row_end, col_init: - col_end] = paddle.randn( - shape=tile_shape, generator=tile_generator) + col_end - col_init, + ) + latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( + shape=tile_shape, generator=tile_generator + ) # overwrite again for seed reroll regions for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, - col_end) # to latent space coordinates + row_init, row_end, col_init, col_end + ) # to latent space coordinates reroll_generator = paddle.Generator().manual_seed(seed_reroll) region_shape = ( latents_shape[0], latents_shape[1], row_end - row_init, - col_end - col_init, ) + col_end - col_init, + ) latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=region_shape, generator=reroll_generator) + shape=region_shape, generator=reroll_generator + ) # Prepare scheduler - accepts_offset = "offset" in set( - inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 @@ -347,18 +334,22 @@ def __call__( latents = latents * self.scheduler.sigmas[0] # get prompts text embeddings - text_input = [[ - self.tokenizer( - col, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", ) for col in row - ] for row in prompt] - text_embeddings = [[ - self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0] - for col in row - ] for row in text_input] + text_input = [ + [ + self.tokenizer( + col, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + for col in row + ] + for row in prompt + ] + text_embeddings = [ + [self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0] for col in row] for row in text_input + ] # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -373,29 +364,26 @@ def __call__( [""] * batch_size, padding="max_length", max_length=max_length, - return_tensors="pd", ) - uncond_embeddings = self.text_encoder( - input_ids=uncond_input.input_ids.astype(np.int64))[0] + return_tensors="pd", + ) + uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int64))[0] # For classifier free guidance, we need to do two forward passes. 
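# A compact sketch of the classifier-free guidance pattern used in this file:
# the unconditional and conditional inputs are stacked into one batch, the model
# is run once, and the two halves of its output are recombined. `fake_unet` is a
# stand-in for the real UNet and the numbers are purely illustrative:
import numpy as np


def cfg_combine(noise_uncond, noise_text, guidance_scale):
    # guidance_scale == 1.0 disables guidance; larger values push towards the prompt
    return noise_uncond + guidance_scale * (noise_text - noise_uncond)


fake_unet = lambda latent_batch: latent_batch * 0.1  # toy model
latents = np.random.randn(1, 4, 8, 8).astype("float32")
latent_model_input = np.concatenate([latents, latents])  # [uncond, cond] in a single batch
noise_uncond, noise_text = np.split(fake_unet(latent_model_input), 2)
noise_pred = cfg_combine(noise_uncond, noise_text, guidance_scale=7.5)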
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings[i][j] = paddle.concat( - x=[uncond_embeddings, text_embeddings[i][j]]) + text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]]) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, - batch_size) + tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) # Diffusion timesteps is_scheduler_support_step_index = self.is_scheduler_support_step_index() @@ -406,48 +394,42 @@ def __call__( for row in range(grid_rows): noise_preds_row = [] for col in range(grid_cols): - ( - px_row_init, - px_row_end, - px_col_init, - px_col_end, ) = _tile2latent_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, ) - tile_latents = latents[:, :, px_row_init:px_row_end, - px_col_init:px_col_end] + (px_row_init, px_row_end, px_col_init, px_col_end,) = _tile2latent_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + ) + tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[tile_latents] * 2) - if do_classifier_free_guidance else - tile_latents) + latent_model_input = ( + paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents + ) if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=text_embeddings[row][col], infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) noise_pred = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - guidance = (guidance_scale - if guidance_scale_tiles is None or - guidance_scale_tiles[row][col] is None else - guidance_scale_tiles[row][col]) - noise_pred_tile = noise_pred_uncond + guidance * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + guidance = ( + guidance_scale + if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None + else guidance_scale_tiles[row][col] + ) + noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) noise_preds_row.append(noise_pred_tile) noise_preds.append(noise_preds_row) # Stitch noise predictions for all tiles @@ -456,22 +438,18 @@ def __call__( # Add each tile 
contribution to overall latents for row in range(grid_rows): for col in range(grid_cols): - ( - px_row_init, - px_row_end, - px_col_init, - px_col_end, ) = _tile2latent_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, ) - noise_pred[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += (noise_preds[row][col] * - tile_weights) - contributors[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += tile_weights + (px_row_init, px_row_end, px_col_init, px_col_end,) = _tile2latent_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + ) + noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( + noise_preds[row][col] * tile_weights + ) + contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights # Average overlapping areas with more than 1 contributor noise_pred /= contributors # compute the previous noisy sample x_t -> x_t-1 @@ -481,10 +459,10 @@ def __call__( t, latents, step_index=i, - return_pred_original_sample=False, ).prev_sample + return_pred_original_sample=False, + ).prev_sample else: - latents = self.scheduler.step(noise_pred, t, - latents).prev_sample + latents = self.scheduler.step(noise_pred, t, latents).prev_sample if i == len(self.scheduler.timesteps) - 1: # sync for accuracy it/s measure paddle.device.cuda.synchronize() @@ -505,13 +483,15 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches): latent_height = tile_height // 8 var = 0.01 midpoint = (latent_width - 1) / 2 - x_probs = [(exp(-(x - midpoint) * (x - midpoint) / - (latent_width * latent_width) / (2 * var)) / - sqrt(2 * pi * var)) for x in range(latent_width)] + x_probs = [ + (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)) + for x in range(latent_width) + ] midpoint = latent_height / 2 - y_probs = [(exp(-(y - midpoint) * (y - midpoint) / - (latent_height * latent_height) / (2 * var)) / - sqrt(2 * pi * var)) for y in range(latent_height)] + y_probs = [ + (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)) + for y in range(latent_height) + ] weights = np.outer(y_probs, x_probs) return paddle.tile( x=paddle.to_tensor(data=weights), diff --git a/ppdiffusers/examples/community/reference_only.py b/ppdiffusers/examples/community/reference_only.py index 7f3035e62a6ea..816ee95647862 100644 --- a/ppdiffusers/examples/community/reference_only.py +++ b/ppdiffusers/examples/community/reference_only.py @@ -20,24 +20,32 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from PIL import Image from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.models.cross_attention import CrossAttention from ppdiffusers.models.transformer_2d import Transformer2DModelOutput -from ppdiffusers.models.unet_2d_blocks import (ResnetBlock2D, - Transformer2DModel, Upsample2D) +from ppdiffusers.models.unet_2d_blocks import ( + ResnetBlock2D, + Transformer2DModel, + Upsample2D, +) from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from 
ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (PIL_INTERPOLATION, check_min_version, deprecate, - logging, randn_tensor, replace_example_docstring) +from ppdiffusers.utils import ( + PIL_INTERPOLATION, + check_min_version, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) check_min_version("0.14.1") @@ -70,18 +78,14 @@ def stable_var(x, axis=None, unbiased=True, keepdim=False, name=None): dtype = x.dtype u = paddle.mean(x, axis=axis, keepdim=True, name=name) - n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast( - paddle.numel(u), paddle.int64) + n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast(paddle.numel(u), paddle.int64) n = n.astype(dtype) if unbiased: one_const = paddle.ones([], x.dtype) n = paddle.where(n > one_const, n - 1.0, one_const) n = n**0.5 n.stop_gradient = True - out = paddle.sum(paddle.pow((x - u) / n, 2), - axis=axis, - keepdim=keepdim, - name=name) + out = paddle.sum(paddle.pow((x - u) / n, 2), axis=axis, keepdim=keepdim, name=name) return out @@ -94,11 +98,12 @@ def var_mean(x, axis=-1, keepdim=True, unbiased=True, correction=None): def self_attn_forward( - self, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, +): attn_output = None if getattr(self, "enable_attn", False): @@ -114,31 +119,34 @@ def self_attn_forward( hidden_states=image_hidden_states, encoder_hidden_states=image_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) latent_self_attn1_uc = self.processor( self, latent_hidden_states, encoder_hidden_states=paddle.concat( - [latent_hidden_states] + image_hidden_states.split( - [chunk_num] * - (image_hidden_states.shape[0] // chunk_num)), - axis=1, ), + [latent_hidden_states] + + image_hidden_states.split([chunk_num] * (image_hidden_states.shape[0] // chunk_num)), + axis=1, + ), attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: latent_self_attn1_c = latent_self_attn1_uc.clone() latent_self_attn1_c[self.current_uc_indices] = self.processor( self, hidden_states=latent_hidden_states[self.current_uc_indices], - encoder_hidden_states=latent_hidden_states[ - self.current_uc_indices], + encoder_hidden_states=latent_hidden_states[self.current_uc_indices], attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) latent_self_attn1 = ( - self.current_style_fidelity * latent_self_attn1_c + - (1.0 - self.current_style_fidelity) * latent_self_attn1_uc) + self.current_style_fidelity * latent_self_attn1_c + + (1.0 - self.current_style_fidelity) * latent_self_attn1_uc + ) else: latent_self_attn1 = latent_self_attn1_uc @@ -150,25 +158,28 @@ def self_attn_forward( hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) return attn_output def transformer_2d_model_forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + 
encoder_hidden_states=None, + timestep=None, + class_labels=None, + cross_attention_kwargs=None, + return_dict: bool = True, +): x = self.original_forward( hidden_states, encoder_hidden_states=encoder_hidden_states, timestep=timestep, class_labels=class_labels, cross_attention_kwargs=cross_attention_kwargs, - return_dict=return_dict, )[0] + return_dict=return_dict, + )[0] output = None if getattr(self, "enable_gn", False): if self.gn_auto_machine_weight > self.gn_weight: @@ -177,26 +188,20 @@ def transformer_2d_model_forward( latent_hidden_states = x[:chunk_num] # uc, c image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean( - image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean( - latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5 + image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 div_num = image_hidden_states.shape[0] // chunk_num mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - std_acc = paddle.maximum(var_acc, - paddle.zeros_like(var_acc) + EPS)**0.5 + std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[ - self.current_uc_indices] - latent_hidden_states = ( - self.current_style_fidelity * y_c + - (1.0 - self.current_style_fidelity) * y_uc) + y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] + latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc else: latent_hidden_states = y_uc output = paddle.concat([latent_hidden_states, image_hidden_states]) @@ -204,7 +209,7 @@ def transformer_2d_model_forward( if output is None: output = x if not return_dict: - return (output, ) + return (output,) return Transformer2DModelOutput(sample=output) @@ -219,26 +224,20 @@ def resnet_block_2d_forward(self, input_tensor, temb): latent_hidden_states = x[:chunk_num] # uc, c image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean( - image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean( - latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5 + image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 div_num = image_hidden_states.shape[0] // chunk_num mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - std_acc = paddle.maximum(var_acc, - paddle.zeros_like(var_acc) + EPS)**0.5 + std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[ - self.current_uc_indices] - 
latent_hidden_states = ( - self.current_style_fidelity * y_c + - (1.0 - self.current_style_fidelity) * y_uc) + y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] + latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc else: latent_hidden_states = y_uc output = paddle.concat([latent_hidden_states, image_hidden_states]) @@ -259,26 +258,20 @@ def upsample_2d_forward(self, hidden_states, output_size=None): latent_hidden_states = x[:chunk_num] # uc, c image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean( - image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean( - latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5 + image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 div_num = image_hidden_states.shape[0] // chunk_num mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - std_acc = paddle.maximum(var_acc, - paddle.zeros_like(var_acc) + EPS)**0.5 + std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[ - self.current_uc_indices] - latent_hidden_states = ( - self.current_style_fidelity * y_c + - (1.0 - self.current_style_fidelity) * y_uc) + y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] + latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc else: latent_hidden_states = y_uc output = paddle.concat([latent_hidden_states, image_hidden_states]) @@ -316,26 +309,16 @@ def preprocess(image, resize_mode, width, height): if isinstance(image, paddle.Tensor): return image elif isinstance(image, PIL.Image.Image): - image = resize_image( - resize_mode=resize_mode, im=image, width=width, height=height) + image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height) image = [image] if isinstance(image[0], PIL.Image.Image): - image = [ - resize_image( - resize_mode=resize_mode, im=im, width=width, height=height) - for im in image - ] + image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image] w, h = image[0].size - w, h = map(lambda x: x - x % 8, - (w, h)) # resize to integer multiple of 8 + w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -382,8 +365,7 @@ def resize(im, w, h): resized = resize(im, src_w, src_h) res = Image.new("RGB", (width, height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) else: ratio = width / height @@ -394,31 +376,22 @@ def resize(im, w, h): resized = resize(im, src_w, 
src_h) res = Image.new("RGB", (width, height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) if ratio < src_ratio: fill_height = height // 2 - src_h // 2 + res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) res.paste( - resized.resize( - (width, fill_height), box=(0, 0, width, 0)), - box=(0, 0)) - res.paste( - resized.resize( - (width, fill_height), - box=(0, resized.height, width, resized.height)), - box=(0, fill_height + src_h), ) + resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), + box=(0, fill_height + src_h), + ) elif ratio > src_ratio: fill_width = width // 2 - src_w // 2 + res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) res.paste( - resized.resize( - (fill_width, height), box=(0, 0, 0, height)), - box=(0, 0)) - res.paste( - resized.resize( - (fill_width, height), - box=(resized.width, 0, resized.width, height)), - box=(fill_width + src_w, 0), ) + resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), + box=(fill_width + src_w, 0), + ) return res @@ -454,37 +427,33 @@ class ReferenceOnlyPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -492,11 +461,7 @@ def __init__( " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -517,12 +482,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -533,12 +496,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -550,21 +510,23 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) self.attn_modules = None self.gn_modules = None def set_reference_only( - self, - attention_auto_machine_weight=1.0, - gn_auto_machine_weight=1.0, - current_style_fidelity=0.5, - enable_attn=True, - enable_gn=True, - do_classifier_free_guidance=True, ): + self, + attention_auto_machine_weight=1.0, + gn_auto_machine_weight=1.0, + current_style_fidelity=0.5, + enable_attn=True, + enable_gn=True, + do_classifier_free_guidance=True, + ): assert 0.0 <= attention_auto_machine_weight <= 1.0 assert 0.0 <= gn_auto_machine_weight <= 2.0 assert 0.0 <= current_style_fidelity <= 1.0 @@ -574,18 +536,14 @@ def set_reference_only( module.enable_attn = enable_attn module.attention_auto_machine_weight = attention_auto_machine_weight module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] if self.gn_modules is not None: for module in self.gn_modules: module.enable_gn = enable_gn module.gn_auto_machine_weight = gn_auto_machine_weight 
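# The group-norm style hooks wired up here (transformer_2d_model_forward,
# resnet_block_2d_forward, upsample_2d_forward) all apply the same AdaIN-style
# update: latent features are re-normalised to the reference image's channel
# statistics, and `current_style_fidelity` blends the guided and unguided
# results. A compact numpy sketch of that update (single reference batch instead
# of averaged chunks; shapes and EPS purely illustrative):
import numpy as np

EPS = 1e-6


def reference_adain(latent, reference, style_fidelity, uc_indices):
    mean = latent.mean(axis=(2, 3), keepdims=True)
    std = np.maximum(latent.var(axis=(2, 3), keepdims=True), EPS) ** 0.5
    ref_mean = reference.mean(axis=(2, 3), keepdims=True)
    ref_std = np.maximum(reference.var(axis=(2, 3), keepdims=True), EPS) ** 0.5
    y_uc = (latent - mean) / std * ref_std + ref_mean  # match the reference statistics
    y_c = y_uc.copy()
    y_c[uc_indices] = latent[uc_indices]  # keep unconditional rows untouched
    return style_fidelity * y_c + (1.0 - style_fidelity) * y_uc


out = reference_adain(
    np.random.randn(2, 4, 8, 8), np.random.randn(2, 4, 8, 8), style_fidelity=0.5, uc_indices=[0]
)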
module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] # init attn_modules if self.attn_modules is None: @@ -599,75 +557,54 @@ def set_reference_only( hidden_size = self.unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list( - reversed(self.unet.config.block_out_channels))[block_id] + hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = self.unet.config.block_out_channels[block_id] self_attn_processors_keys.append([name, hidden_size]) # sorted by (-hidden_size, name),down -> mid -> up. - for i, (name, _) in enumerate( - sorted( - self_attn_processors_keys, - key=lambda x: (-x[1], x[0]))): + for i, (name, _) in enumerate(sorted(self_attn_processors_keys, key=lambda x: (-x[1], x[0]))): module = self.unet.get_sublayer(name) - module.attn_weight = float(i) / float( - len(self_attn_processors_keys)) + module.attn_weight = float(i) / float(len(self_attn_processors_keys)) module.enable_attn = enable_attn module.attention_auto_machine_weight = attention_auto_machine_weight module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] attn_modules.append(module) self.attn_modules = attn_modules # init gn_modules if self.gn_modules is None: - gn_modules = [self.unet.mid_block.attentions[-1], ] - self.unet.mid_block.attentions[ - -1].gn_weight = 0.0 # mid 0.0 + gn_modules = [ + self.unet.mid_block.attentions[-1], + ] + self.unet.mid_block.attentions[-1].gn_weight = 0.0 # mid 0.0 input_block_names = [ - ("down_blocks.1.resnets.0", - "down_blocks.1.attentions.0"), # 4 2.0 - ("down_blocks.1.resnets.1", - "down_blocks.1.attentions.1"), # 5 1.66 - ("down_blocks.2.resnets.0", - "down_blocks.2.attentions.0"), # 7 1.33 - ("down_blocks.2.resnets.1", - "down_blocks.2.attentions.1"), # 8 1.0 - ("down_blocks.3.resnets.0", - ), # 10 0.66 - ("down_blocks.3.resnets.1", - ), # 11 0.33 + ("down_blocks.1.resnets.0", "down_blocks.1.attentions.0"), # 4 2.0 + ("down_blocks.1.resnets.1", "down_blocks.1.attentions.1"), # 5 1.66 + ("down_blocks.2.resnets.0", "down_blocks.2.attentions.0"), # 7 1.33 + ("down_blocks.2.resnets.1", "down_blocks.2.attentions.1"), # 8 1.0 + ("down_blocks.3.resnets.0",), # 10 0.66 + ("down_blocks.3.resnets.1",), # 11 0.33 ] for w, block_names in enumerate(input_block_names): module = self.unet.get_sublayer(block_names[-1]) - module.gn_weight = 1.0 - float(w) / float( - len(input_block_names)) + module.gn_weight = 1.0 - float(w) / float(len(input_block_names)) gn_modules.append(module) output_block_names = [ - ("up_blocks.0.resnets.0", - ), # 0 0.0 - ("up_blocks.0.resnets.1", - ), # 1 0.25 - ("up_blocks.0.resnets.2", - "up_blocks.0.upsamplers.0"), # 2 0.5 - ("up_blocks.1.resnets.0", - "up_blocks.1.attentions.0"), # 3 0.75 - ("up_blocks.1.resnets.1", - "up_blocks.1.attentions.1"), # 4 1.0 - ("up_blocks.1.resnets.2", - "up_blocks.1.attentions.2"), # 5 1.25 - ("up_blocks.2.resnets.0", - "up_blocks.2.attentions.0"), # 6 1.5 - ("up_blocks.2.resnets.1", - "up_blocks.2.attentions.1"), # 7 1.75 + ("up_blocks.0.resnets.0",), # 0 0.0 + ("up_blocks.0.resnets.1",), # 1 0.25 + ("up_blocks.0.resnets.2", "up_blocks.0.upsamplers.0"), 
# 2 0.5 + ("up_blocks.1.resnets.0", "up_blocks.1.attentions.0"), # 3 0.75 + ("up_blocks.1.resnets.1", "up_blocks.1.attentions.1"), # 4 1.0 + ("up_blocks.1.resnets.2", "up_blocks.1.attentions.2"), # 5 1.25 + ("up_blocks.2.resnets.0", "up_blocks.2.attentions.0"), # 6 1.5 + ("up_blocks.2.resnets.1", "up_blocks.2.attentions.1"), # 7 1.75 ] for w, block_names in enumerate(output_block_names): module = self.unet.get_sublayer(block_names[-1]) @@ -679,20 +616,19 @@ def set_reference_only( module.enable_gn = enable_gn module.gn_auto_machine_weight = gn_auto_machine_weight module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] self.gn_modules = gn_modules def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -728,29 +664,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -758,8 +696,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -769,14 +706,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be 
the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -786,46 +725,42 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -844,53 +779,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -903,17 +834,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -934,12 +867,13 @@ def prepare_latents( return latents def prepare_image_latents( - self, - image, - batch_size, - dtype, - generator=None, - do_classifier_free_guidance=False, ): + self, + image, + batch_size, + dtype, + generator=None, + do_classifier_free_guidance=False, + ): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -948,8 +882,7 @@ def prepare_image_latents( if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -965,33 +898,32 @@ def prepare_image_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[PIL.Image.Image, List[PIL.Image.Image], - paddle.Tensor]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - # reference - control_name: str="reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn" - attention_auto_machine_weight: float=1.0, - gn_auto_machine_weight: float=1.0, - current_style_fidelity: float=0.5, - resize_mode: int=-1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = 
None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + # reference + control_name: str = "reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn" + attention_auto_machine_weight: float = 1.0, + gn_auto_machine_weight: float = 1.0, + current_style_fidelity: float = 0.5, + resize_mode: int = -1, + ): r""" Function invoked when calling the pipeline for generation. @@ -1079,7 +1011,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1101,7 +1034,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -1118,55 +1052,57 @@ def __call__( width, dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. reference_only - enable_attn = ("only" in control_name or "attn" in control_name and - image is not None and attention_auto_machine_weight > 0) - enable_gn = ("adain" in control_name and image is not None and - gn_auto_machine_weight > 0) + enable_attn = ( + "only" in control_name + or "attn" in control_name + and image is not None + and attention_auto_machine_weight > 0 + ) + enable_gn = "adain" in control_name and image is not None and gn_auto_machine_weight > 0 self.set_reference_only( attention_auto_machine_weight, gn_auto_machine_weight, current_style_fidelity, enable_attn, enable_gn, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) if enable_attn or enable_gn: image = preprocess(image, resize_mode, width, height) image_latents = self.prepare_image_latents( - image, batch_size, dtype, generator, - do_classifier_free_guidance) + image, batch_size, dtype, generator, do_classifier_free_guidance + ) prompt_embeds = prompt_embeds.tile([1 + image.shape[0], 1, 1]) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if enable_attn or enable_gn: - image_noise = randn_tensor( - image_latents.shape, generator=generator, dtype=dtype) + image_noise = randn_tensor(image_latents.shape, generator=generator, dtype=dtype) image_latent_model_input = self.scheduler.scale_model_input( - self.scheduler.add_noise(image_latents, image_noise, t), - t) + self.scheduler.add_noise(image_latents, image_noise, t), t + ) chunk_num = 2 if do_classifier_free_guidance else 1 noise_pred = self.unet( - paddle.concat([ - latent_model_input, - image_latent_model_input.cast( - latent_model_input.dtype), - ]), + paddle.concat( + [ + latent_model_input, + image_latent_model_input.cast(latent_model_input.dtype), + ] + ), t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, @@ -1176,22 +1112,19 @@ def __call__( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1204,8 +1137,7 @@ def __call__( image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 11. Convert to PIL image = self.numpy_to_pil(image) @@ -1214,11 +1146,9 @@ def __call__( image = self.decode_latents(latents) # 10. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py index 5d0dc0e26b395..25e821228b061 100644 --- a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py +++ b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py @@ -19,23 +19,27 @@ import numpy as np import paddle import PIL.Image -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ppdiffusers.image_processor import VaeImageProcessor from ppdiffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ppdiffusers.models import (AutoencoderKL, ControlNetModel, - UNet2DConditionModel) +from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import \ - MultiControlNetModel -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import ( + MultiControlNetModel, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (check_min_version, deprecate, logging, - randn_tensor, replace_example_docstring) +from ppdiffusers.utils import ( + check_min_version, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) check_min_version("0.16.1") @@ -88,8 +92,7 @@ """ -class StableDiffusionControlNetImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. 
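As background for the image-to-image behaviour of this ControlNet pipeline: its get_timesteps helper (reformatted further down in this file) keeps only the tail of the noise schedule, so `strength` controls how much of the reference image survives denoising. A standalone sketch of that bookkeeping, with illustrative numbers (names are hypothetical, not part of the patch):

    # Sketch of the strength -> timesteps bookkeeping used by img2img-style pipelines.
    def get_timesteps_sketch(num_inference_steps, strength, timesteps, order=1):
        # e.g. 50 steps at strength 0.8 -> init_timestep = 40, t_start = 10:
        # the first 10 steps are skipped and denoising starts from a partially noised image.
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        return timesteps[t_start * order:], num_inference_steps - t_start

With strength=1.0 the whole schedule runs (generation guided only by the prompt and control image); with a small strength only the last few steps run, so the output stays close to the input image.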
@@ -126,17 +129,22 @@ class StableDiffusionControlNetImg2ImgPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ - ControlNetModel], MultiControlNetModel, ], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ + ControlNetModel, + List[ControlNetModel], + Tuple[ControlNetModel], + MultiControlNetModel, + ], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -166,25 +174,27 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, - do_normalize=False, ) + do_normalize=False, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
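The _encode_prompt methods being reformatted in this file all follow the same recipe: tokenize the prompt, run the CLIP text encoder, tile the embeddings per generated image, and, under classifier-free guidance, stack the negative and positive embeddings into a single batch so the UNet runs once per step. A condensed sketch, assuming already-loaded `tokenizer` and `text_encoder` objects (as in the pipelines above) and a single prompt; the function name is illustrative only:

    import paddle

    def encode_prompt_sketch(tokenizer, text_encoder, prompt, negative_prompt="", num_images_per_prompt=1):
        def embed(text):
            ids = tokenizer([text], padding="max_length",
                            max_length=tokenizer.model_max_length,
                            truncation=True, return_tensors="pd").input_ids
            emb = text_encoder(ids)[0]                      # [1, seq_len, dim]
            return emb.tile([num_images_per_prompt, 1, 1])  # one copy per generated image

        cond = embed(prompt)
        uncond = embed(negative_prompt)
        # unconditional first, conditional second: the UNet output is later split with chunk(2) in that order
        return paddle.concat([uncond, cond])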
@@ -231,32 +241,36 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) @@ -264,33 +278,32 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -298,39 +311,38 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -339,16 +351,13 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) return image, has_nsfw_concept def prepare_extra_step_kwargs(self, generator, eta): @@ -357,48 +366,46 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - controlnet_conditioning_scale=1.0, ): + self, + prompt, + image, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -411,7 +418,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # `prompt` needs more sophisticated handling when there are multiple # conditionings. @@ -426,15 +434,12 @@ def check_inputs( self.check_image(image, prompt, prompt_embeds) elif isinstance(self.controlnet, MultiControlNetModel): if not isinstance(image, list): - raise TypeError( - "For multiple controlnets: `image` must be type `list`") + raise TypeError("For multiple controlnets: `image` must be type `list`") # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." - ) + raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." @@ -448,22 +453,18 @@ def check_inputs( # Check `controlnet_conditioning_scale` if isinstance(self.controlnet, ControlNetModel): if not isinstance(controlnet_conditioning_scale, float): - raise TypeError( - "For single controlnet: `controlnet_conditioning_scale` must be type `float`." - ) + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") elif isinstance(self.controlnet, MultiControlNetModel): if isinstance(controlnet_conditioning_scale, list): - if any( - isinstance(i, list) - for i in controlnet_conditioning_scale): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." 
- ) - elif isinstance(controlnet_conditioning_scale, list) and len( - controlnet_conditioning_scale) != len(self.controlnet.nets): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): raise ValueError( "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets") + " the same length as the number of controlnets" + ) else: assert False @@ -471,16 +472,18 @@ def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, paddle.Tensor) image_is_np = isinstance(image, np.ndarray) - image_is_pil_list = isinstance(image, list) and isinstance( - image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance( - image[0], paddle.Tensor) - image_is_np_list = isinstance(image, list) and isinstance(image[0], - np.ndarray) - - if (not image_is_pil and not image_is_tensor and not image_is_np and - not image_is_pil_list and not image_is_tensor_list and - not image_is_np_list): + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): raise TypeError( f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}" ) @@ -503,17 +506,17 @@ def check_image(self, image, prompt, prompt_embeds): ) def prepare_control_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, ): - image = self.control_image_processor.preprocess( - image, height=height, width=width).cast(dtype=paddle.float32) + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32) image_batch_size = image.shape[0] if image_batch_size == 1: @@ -533,21 +536,14 @@ def prepare_control_image( def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is 
{type(image)}" @@ -569,18 +565,15 @@ def prepare_latents(self, elif isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample( - generator[i]) for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: - init_latents = self.vae.encode(image).latent_dist.sample( - generator) + init_latents = self.vae.encode(image).latent_dist.sample(generator) init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -592,12 +585,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) @@ -616,33 +608,44 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray, List[ - paddle.Tensor], List[PIL.Image.Image], List[np.ndarray], ]=None, - control_image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray, - List[paddle.Tensor], List[ - PIL.Image.Image], List[np.ndarray], ]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_conditioning_scale: Union[float, List[float]]=0.8, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[ + paddle.Tensor, + PIL.Image.Image, + np.ndarray, + List[paddle.Tensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + control_image: Union[ + paddle.Tensor, + PIL.Image.Image, + np.ndarray, + List[paddle.Tensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: 
Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.8, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -740,7 +743,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - controlnet_conditioning_scale, ) + controlnet_conditioning_scale, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -757,20 +761,20 @@ def __call__( controlnet = self.controlnet - if isinstance(controlnet, MultiControlNetModel) and isinstance( - controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale - ] * len(controlnet.nets) + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) global_pool_conditions = ( controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetModel) else - controlnet.nets[0].config.global_pool_conditions) + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -778,10 +782,10 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, ) + lora_scale=text_encoder_lora_scale, + ) # 4. Prepare image - image = self.image_processor.preprocess(image).cast( - dtype=paddle.float32) + image = self.image_processor.preprocess(image).cast(dtype=paddle.float32) # 5. Prepare controlnet_conditioning_image if isinstance(controlnet, ControlNetModel): @@ -793,7 +797,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) elif isinstance(controlnet, MultiControlNetModel): control_images = [] @@ -806,7 +811,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) control_images.append(control_image_) @@ -815,11 +821,11 @@ def __call__( assert False # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, ) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + self.scheduler.set_timesteps( + num_inference_steps, + ) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. 
Prepare latent variables latents = self.prepare_latents( @@ -828,28 +834,25 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # controlnet(s) inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. control_model_input = latents - control_model_input = self.scheduler.scale_model_input( - control_model_input, t) + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: control_model_input = latent_model_input @@ -862,20 +865,17 @@ def __call__( controlnet_cond=control_image, conditioning_scale=controlnet_conditioning_scale, guess_mode=guess_mode, - return_dict=False, ) + return_dict=False, + ) if guess_mode and do_classifier_free_guidance: # Infered ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
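# A sketch of why the zero-padding below works (not part of the patch): with classifier-free guidance the
# UNet batch is ordered [uncond, cond] (2B samples), but in guess mode ControlNet only saw the conditional
# half (B samples). Its residuals are *added* to the UNet's feature maps, so padding the unconditional half
# with zeros leaves that branch untouched while the conditional half still gets full ControlNet guidance:
#
#     B, C, H, W = 1, 320, 64, 64                    # illustrative shapes only
#     d = paddle.randn([B, C, H, W])                 # residual computed for the conditional half
#     d = paddle.concat([paddle.zeros_like(d), d])   # -> [2B, C, H, W]; the uncond half receives +0
#
# After the UNet call the halves are split again (noise_pred.chunk(2)) and recombined as
# uncond + guidance_scale * (cond - uncond).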
- down_block_res_samples = [ - paddle.concat([paddle.zeros_like(d), d]) - for d in down_block_res_samples - ] - mid_block_res_sample = paddle.concat([ - paddle.zeros_like(mid_block_res_sample), - mid_block_res_sample - ]) + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat( + [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] + ) # predict the noise residual noise_pred = self.unet( @@ -885,35 +885,26 @@ def __call__( cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self.vae.decode( - latents / self.vae.config.scaling_factor, return_dict=False)[0] - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) else: image = latents has_nsfw_concept = None @@ -923,11 +914,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py index 73eae51ab8e43..420f7c4ee7053 100644 --- a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py +++ b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py @@ -19,18 +19,21 @@ import paddle from packaging import version -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion import 
StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ppdiffusers.utils import ( + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) logger = logging.get_logger(__name__) @@ -80,37 +83,33 @@ class StableDiffusionHiresFixPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -118,11 +117,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -143,12 +138,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -159,12 +152,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -176,18 +166,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
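A quick worked example for the vae_scale_factor line above: the standard Stable Diffusion VAE has four entries in block_out_channels, so the factor comes out to 8, which is exactly why check_inputs requires height and width to be divisible by 8 (latents live at height/8 x width/8). Assuming a typical SD 1.x VAE config:

    block_out_channels = [128, 256, 512, 512]               # typical SD VAE config (assumed here)
    vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # -> 8
    height, width = 512, 512
    latent_height, latent_width = height // vae_scale_factor, width // vae_scale_factor  # 64 x 64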
@@ -223,29 +215,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -253,8 +247,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -264,14 +257,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -281,36 +276,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -319,7 +311,7 @@ def get_timesteps(self, denoising_steps, denoising_strength): self.scheduler.set_timesteps(steps) t_start = max(steps - denoising_steps, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] if hasattr(self.scheduler, "step_index_offset"): self.scheduler.step_index_offset = t_start * self.scheduler.order @@ -328,11 +320,10 @@ def get_timesteps(self, denoising_steps, denoising_strength): def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -351,62 +342,57 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return 
extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + hr_scale, + hr_resize_height, + hr_resize_width, + denoising_strength, + latent_scale_mode, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if hr_scale < 0: - raise ValueError( - "hr_scale shoule be greater that 0, but acceived {hr_scale}") + raise ValueError(f"hr_scale should be greater than 0, but received {hr_scale}") if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: raise ValueError( @@ -414,9 +400,7 @@ def check_inputs( ) if denoising_strength > 1 or denoising_strength < 0: - raise ValueError( - f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}" - ) + raise ValueError(f"denoising_strength should be set between 0 and 1, but received {denoising_strength}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -434,17 +418,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}."
+ ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -464,12 +450,7 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - def get_upscaled_width_and_height(self, - width, - height, - hr_scale=2, - hr_resize_width=0, - hr_resize_height=0): + def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): if hr_resize_width == 0 and hr_resize_height == 0: hr_upscale_to_width = int(width * hr_scale) hr_upscale_to_height = int(height * hr_scale) @@ -496,32 +477,32 @@ def get_upscaled_width_and_height(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=40, - hires_ratio: Optional[float]=0.5, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - enable_hr: Optional[bool]=True, - hr_scale: Optional[float]=2.0, - hr_resize_width: Optional[int]=0, - hr_resize_height: Optional[int]=0, - denoising_strength: Optional[float]=0.7, - latent_scale_mode: Optional[str]="nearest", ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 40, + hires_ratio: Optional[float] = 0.5, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + enable_hr: Optional[bool] = True, + hr_scale: Optional[float] = 2.0, + hr_resize_width: Optional[int] = 0, + hr_resize_height: Optional[int] = 0, + denoising_strength: Optional[float] = 0.7, + latent_scale_mode: Optional[str] = "nearest", + ): r""" Function invoked when calling the pipeline for generation. @@ -622,7 +603,8 @@ def __call__( latent_scale_mode, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -644,7 +626,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps if enable_hr: @@ -660,11 +643,9 @@ def __call__( # 5. 
Prepare latent variables if generator is None: generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - generator_state) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - paddle.Generator().states_[generator]) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) num_channels_latents = self.unet.in_channels latents = self.prepare_latents( @@ -674,7 +655,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -684,32 +666,27 @@ def __call__( with self.progress_bar(total=sample_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -719,85 +696,74 @@ def __call__( # 8. determine the upscaled width and height for upscaled images truncate_width = 0 truncate_height = 0 - ( - self.hr_upscale_to_width, - self.hr_upscale_to_height, - ) = self.get_upscaled_width_and_height( + (self.hr_upscale_to_width, self.hr_upscale_to_height,) = self.get_upscaled_width_and_height( width, height, hr_scale=hr_scale, hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, ) + hr_resize_height=hr_resize_height, + ) if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (self.hr_upscale_to_width - hr_resize_width - ) // self.vae_scale_factor - truncate_height = (self.hr_upscale_to_height - hr_resize_height - ) // self.vae_scale_factor + truncate_width = (self.hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor + truncate_height = (self.hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor # 9. 
special case: do nothing if upscaling is not nesscessary - if (self.hr_upscale_to_width == width and - self.hr_upscale_to_height == height): + if self.hr_upscale_to_width == width and self.hr_upscale_to_height == height: enable_hr = False denoising_strength = None if enable_hr: # 10. prepare init latents - timesteps, hr_steps = self.get_timesteps(hr_steps, - denoising_strength) + timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength) init_timestep = timesteps[:1].tile([latents.shape[0]]) latents = paddle.nn.functional.interpolate( latents, size=( self.hr_upscale_to_height // self.vae_scale_factor, - self.hr_upscale_to_width // self.vae_scale_factor, ), - mode=latent_scale_mode, ) - latents = latents[:, :, truncate_height // 2:latents.shape[2] - ( - truncate_height + 1) // 2, truncate_width // 2:latents.shape[3] - - (truncate_width + 1) // 2, ] - - noise = randn_tensor( - latents.shape, - dtype=latents.dtype, - generator="initial_generator") + self.hr_upscale_to_width // self.vae_scale_factor, + ), + mode=latent_scale_mode, + ) + latents = latents[ + :, + :, + truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, + truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, + ] + + noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") latents = self.scheduler.add_noise(latents, noise, init_timestep) # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs( - "initial_generator", eta) + extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) # 12. denoising on hires.fix steps num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order with self.progress_bar(total=hr_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else - latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -808,16 +774,13 @@ def __call__( has_nsfw_concept = None elif output_type == "pil": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, 
prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) image = self.numpy_to_pil(image) else: image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_mega.py b/ppdiffusers/examples/community/stable_diffusion_mega.py index ba2adb2a179ec..71ff024d88b08 100644 --- a/ppdiffusers/examples/community/stable_diffusion_mega.py +++ b/ppdiffusers/examples/community/stable_diffusion_mega.py @@ -21,30 +21,44 @@ import paddle import PIL import PIL.Image -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, ControlNetModel, DDIMScheduler, DDPMScheduler, - DEISMultistepScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, - UniPCMultistepScheduler) + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, + UniPCMultistepScheduler, +) from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput +from ppdiffusers.loaders import ( + FromCkptMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from ppdiffusers.pipelines.stable_diffusion.pipeline_cycle_diffusion import ( - compute_noise, posterior_sample) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker + compute_noise, + posterior_sample, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (PIL_INTERPOLATION, deprecate, logging, - randn_tensor) +from ppdiffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -66,7 +80,8 @@ [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -185,32 +200,20 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. 
Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. """ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -218,8 +221,7 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] @@ -228,10 +230,11 @@ def pad_tokens_and_weights(tokens, def get_unweighted_text_embeddings( - pipe, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, ): + pipe, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. @@ -241,8 +244,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -268,14 +270,15 @@ def get_unweighted_text_embeddings( def get_weighted_text_embeddings( - pipe, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + pipe, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -299,24 +302,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. 
""" - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -324,33 +322,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( prompt_tokens, @@ -360,7 +351,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( @@ -371,7 +363,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) # get the embeddings @@ -379,43 +372,35 @@ def get_weighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, 
- no_boseos_middle=no_boseos_middle, ) - prompt_weights = paddle.to_tensor( - prompt_weights, dtype=text_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - uncond_weights = paddle.to_tensor( - uncond_weights, dtype=uncond_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= ( - (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1) - .unsqueeze(-1)) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= ( - (previous_mean / uncond_embeddings.mean(axis=[-2, -1])) - .unsqueeze(-1).unsqueeze(-1)) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: return text_embeddings, uncond_embeddings return text_embeddings, None -def prepare_mask_and_masked_image(image, - mask, - height=None, - width=None, - return_image: bool=False): +def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): """ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. 
This means that those inputs will be converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the @@ -452,14 +437,11 @@ def prepare_mask_and_masked_image(image, if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -476,12 +458,9 @@ def prepare_mask_and_masked_image(image, else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -498,8 +477,7 @@ def prepare_mask_and_masked_image(image, # Image as float32 image = image.cast(dtype=paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -510,13 +488,8 @@ def prepare_mask_and_masked_image(image, w, h = image[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - image = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) - for i in image - ] + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] image = [np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -535,14 +508,9 @@ def prepare_mask_and_masked_image(image, w, h = mask[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - mask = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask - ] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -563,55 +531,45 @@ def prepare_mask_and_masked_image(image, class CommonMixIn: @property def components(self) -> Dict[str, Any]: - return { - k: getattr(self, k) - for k in self.config.keys() if not k.startswith("_") - } + return {k: getattr(self, k) for k in self.config.keys() if 
not k.startswith("_")} def change_scheduler(self, scheduler_type="ddim"): scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, - ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError( f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" 
@@ -623,11 +581,10 @@ def get_timesteps(self, num_inference_steps, strength=1.0): return self.scheduler.timesteps, num_inference_steps # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] num_inference_steps = num_inference_steps - t_start # check that number of inference steps is not < 1 - as this doesn't make sense @@ -640,26 +597,26 @@ def get_timesteps(self, num_inference_steps, strength=1.0): return timesteps, num_inference_steps def prepare_controlnet_cond( - self, - controlnet_cond, - controlnet_conditioning_scale, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, ): + self, + controlnet_cond, + controlnet_conditioning_scale, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): control_image = self.control_image_processor.preprocess( controlnet_cond, height=height, - width=width, ) + width=width, + ) if isinstance(controlnet_conditioning_scale, (float, int)): - controlnet_conditioning_scale = paddle.to_tensor( - [controlnet_conditioning_scale] * 13, dtype=dtype) + controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=dtype) elif isinstance(controlnet_conditioning_scale, (list, tuple)): - controlnet_conditioning_scale = paddle.to_tensor( - controlnet_conditioning_scale, dtype=dtype) + controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=dtype) else: raise ValueError( f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" @@ -678,40 +635,40 @@ def prepare_controlnet_cond( return control_image, controlnet_conditioning_scale def check_inputs( - self, - prompt, - height=512, - width=512, - callback_steps=1, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - strength=1.0, ): + self, + prompt, + height=512, + width=512, + callback_steps=1, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + strength=1.0, + ): if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError( f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -724,24 +681,25 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") def prepare_latents( - self, - batch_size, - height, - width, - generator, - dtype=None, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, ): + self, + batch_size, + height, + width, + generator, + dtype=None, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): shape = [ batch_size, self.vae.config.latent_channels, @@ -762,53 +720,50 @@ def prepare_latents( if return_image_latents or (latents is None and not is_strength_max): image = image.cast(dtype=dtype) - image_latents = self._encode_vae_image( - image, batch_size=batch_size, generator=generator) + image_latents = self._encode_vae_image(image, batch_size=batch_size, generator=generator) if latents is None: noise = randn_tensor(shape, generator=generator, dtype=dtype) # if strength is 1.
then initialise the latents to noise, else initial to image + noise - latents = (noise if is_strength_max else - self.scheduler.add_noise(image_latents, noise, timestep)) + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = (latents * self.scheduler.init_noise_sigma - if is_strength_max else latents) + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: noise = latents if str(noise.dtype).replace("paddle.", "") != dtype: noise = noise.cast(dtype) latents = noise * self.scheduler.init_noise_sigma - outputs = (latents, ) + outputs = (latents,) if return_noise: - outputs += (noise, ) + outputs += (noise,) if return_image_latents: - outputs += (image_latents, ) + outputs += (image_latents,) if len(outputs) == 1: outputs = latents return outputs def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - generator, - dtype, - do_classifier_free_guidance=False, - return_masked_image_latents=True, ): + self, + mask, + masked_image, + batch_size, + height, + width, + generator, + dtype, + do_classifier_free_guidance=False, + return_masked_image_latents=True, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision mask = paddle.nn.functional.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) mask = mask.cast(dtype=dtype) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -821,14 +776,12 @@ def prepare_mask_latents( ) mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask if not return_masked_image_latents: return mask masked_image = masked_image.cast(dtype=dtype) - masked_image_latents = self._encode_vae_image( - masked_image, batch_size=batch_size, generator=generator) + masked_image_latents = self._encode_vae_image(masked_image, batch_size=batch_size, generator=generator) if masked_image_latents.shape[0] < batch_size: if not batch_size % masked_image_latents.shape[0] == 0: raise ValueError( @@ -836,31 +789,24 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype=dtype) return mask, masked_image_latents def is_scheduler_support_step_index(self): - kwargs_keys = set( - inspect.signature(self.scheduler.step).parameters.keys()) + kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) return "kwargs" in kwargs_keys or "step_index" in kwargs_keys - def _encode_vae_image(self, - image: paddle.Tensor, - batch_size=1, - generator=None, - **kwargs): + def _encode_vae_image(self, image: paddle.Tensor, batch_size=1, generator=None, **kwargs): if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -868,21 +814,24 @@ def _encode_vae_image(self, return self.vae.config.scaling_factor * init_latents def _decode_vae_latents(self, latents: paddle.Tensor, **kwargs): - images_vae = self.vae.decode(latents, )[0] + images_vae = self.vae.decode( + latents, + )[0] return images_vae def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): if parse_prompt_type == "lpw": return self._encode_prompt_lpw( prompt, @@ -893,7 +842,8 @@ def _encode_prompt( negative_prompt_embeds=negative_prompt_embeds, lora_scale=lora_scale, max_embeddings_multiples=max_embeddings_multiples, - **kwargs, ) + **kwargs, + ) elif parse_prompt_type == "raw": return self._encode_prompt_raw( prompt, @@ -902,22 +852,23 @@ def _encode_prompt( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, ) + lora_scale=lora_scale, + ) elif parse_prompt_type == "webui": - raise NotImplementedError( - "`parse_prompt_type=webui` is not implemented yet.") + raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") def _encode_prompt_lpw( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt: 
Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -953,66 +904,63 @@ def _encode_prompt_lpw( if do_classifier_free_guidance: if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings( pipe=self, prompt=prompt, uncond_prompt=uncond_tokens, max_embeddings_multiples=max_embeddings_multiples, - **kwargs, ) + **kwargs, + ) prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def _encode_prompt_raw( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, - **kwargs, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -1059,32 +1007,36 @@ def _encode_prompt_raw( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) @@ -1092,33 +1044,32 @@ def _encode_prompt_raw( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, 
but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -1126,39 +1077,38 @@ def _encode_prompt_raw( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -1167,16 +1117,13 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) return image, has_nsfw_concept def prepare_extra_step_kwargs(self, generator, eta): @@ -1185,26 +1132,25 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs class StableDiffusionMegaPipeline( - DiffusionPipeline, - CommonMixIn, - FromCkptMixin, - LoraLoaderMixin, - TextualInversionLoaderMixin, ): + DiffusionPipeline, + CommonMixIn, + FromCkptMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, +): r""" Pipeline for mega using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -1239,37 +1185,33 @@ def __call__(self, *args, **kwargs): return self.text2img(*args, **kwargs) def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: ControlNetModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: ControlNetModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. 
Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -1277,11 +1219,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -1310,15 +1248,16 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, - do_normalize=False, ) + do_normalize=False, + ) self.supported_scheduler = [ "pndm", "lms", @@ -1340,19 +1279,20 @@ def __init__( @paddle.no_grad() def do_unet( - self, - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - is_scheduler_support_step_index=False, ): + self, + do_controlnet, + latents, + latent_model_input, + t, + i, + prompt_embeds, + control_image, + control_conditioning_scale, + cross_attention_kwargs, + guess_mode, + do_classifier_free_guidance, + is_scheduler_support_step_index=False, + ): if not do_controlnet: # predict the noise residual noise_pred_unet = self.unet( @@ -1360,18 +1300,17 @@ def do_unet( timestep=t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] else: # controlnet inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. 
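# In guess mode with classifier-free guidance the ControlNet only sees the conditional half of
# the batch: the raw latents (not the CFG-expanded latent_model_input) are re-scaled for this
# timestep, and prompt_embeds.chunk(2)[1] selects the text-conditioned embeddings; the resulting
# residuals are zero-padded for the unconditional half further below.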
control_model_input = latents if is_scheduler_support_step_index: - control_model_input = self.scheduler.scale_model_input( - control_model_input, t, step_index=i) + control_model_input = self.scheduler.scale_model_input(control_model_input, t, step_index=i) else: - control_model_input = self.scheduler.scale_model_input( - control_model_input, t) + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: control_model_input = latent_model_input @@ -1384,20 +1323,15 @@ def do_unet( controlnet_cond=control_image, conditioning_scale=control_conditioning_scale, guess_mode=guess_mode, - return_dict=False, ) + return_dict=False, + ) if guess_mode and do_classifier_free_guidance: # Infered ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. - down_block_res_samples = [ - paddle.concat([paddle.zeros_like(d), d]) - for d in down_block_res_samples - ] - mid_block_res_sample = paddle.concat([ - paddle.zeros_like(mid_block_res_sample), - mid_block_res_sample - ]) + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat([paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]) # predict the noise residual noise_pred_unet = self.unet( @@ -1407,35 +1341,36 @@ def do_unet( cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, - return_dict=False, )[0] + return_dict=False, + )[0] return noise_pred_unet @paddle.no_grad() def text2img( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + 
controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -1535,7 +1470,8 @@ def text2img( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1551,12 +1487,13 @@ def text2img( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -1566,7 +1503,8 @@ def text2img( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -1583,7 +1521,8 @@ def text2img( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None @@ -1598,27 +1537,24 @@ def text2img( width, generator=generator, dtype=dtype, - latents=latents, ) + latents=latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -1637,10 +1573,8 @@ def text2img( # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 @@ -1651,22 +1585,19 @@ def text2img( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -1677,43 +1608,41 @@ def text2img( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def img2img( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - 
paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -1828,10 +1757,10 @@ def img2img( controlnet_conditioning_scale=controlnet_conditioning_scale, guess_mode=guess_mode, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) # 0. Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs. Raise error if not correct @@ -1843,7 +1772,8 @@ def img2img( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1857,12 +1787,13 @@ def img2img( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. 
Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -1872,7 +1803,8 @@ def img2img( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -1889,19 +1821,18 @@ def img2img( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare latent variables # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 latents = self.prepare_latents( @@ -1913,21 +1844,19 @@ def img2img( latents=latents, image=init_image, timestep=latent_timestep, - is_strength_max=is_strength_max, ) + is_strength_max=is_strength_max, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -1940,35 +1869,26 @@ def img2img( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -1979,45 +1899,43 @@ def img2img( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def inpaint_legacy( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: 
Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -2122,7 +2040,8 @@ def inpaint_legacy( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -2134,7 +2053,8 @@ def inpaint_legacy( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -2149,12 +2069,13 @@ def inpaint_legacy( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -2164,7 +2085,8 @@ def inpaint_legacy( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -2181,18 +2103,17 @@ def inpaint_legacy( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 
50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -2208,7 +2129,8 @@ def inpaint_legacy( timestep=latent_timestep, is_strength_max=is_strength_max, return_noise=True, - return_image_latents=True, ) + return_image_latents=True, + ) # 6. Prepare mask latent variables mask = self.prepare_mask_latents( @@ -2220,26 +2142,24 @@ def inpaint_legacy( dtype=dtype, generator=generator, do_classifier_free_guidance=do_classifier_free_guidance, - return_masked_image_latents=False, ) + return_masked_image_latents=False, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -2252,51 +2172,39 @@ def inpaint_legacy( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > 
num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -2307,45 +2215,43 @@ def inpaint_legacy( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def inpaint( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. 
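Both inpainting entry points share the same denoising loop but prepare the UNet input differently: `inpaint` can drive a 9-channel inpainting UNet by concatenating latents, mask and masked-image latents on the channel axis, while the legacy 4-channel path re-imposes the known region after every scheduler step. A minimal sketch of the two strategies, using illustrative helper names (the actual tensors come from `prepare_latents` and `prepare_mask_latents`, and the channel count is checked against `self.unet.config.in_channels`):

import paddle

def build_unet_input(latent_model_input, mask, masked_image_latents, num_channels_unet):
    # 9-channel inpainting UNet: mask and masked-image latents ride along in the channel axis,
    # mirroring the paddle.concat([...], axis=1) done inside the denoising loop below.
    if num_channels_unet == 9:
        return paddle.concat([latent_model_input, mask, masked_image_latents], axis=1)
    # 4-channel (legacy) UNet: the input is left unchanged; known pixels are re-imposed
    # after each scheduler step with the blend below.
    return latent_model_input

def legacy_blend(latents, init_latents_proper, init_mask):
    # keep original content where init_mask == 0, freshly denoised content where init_mask == 1
    return (1 - init_mask) * init_latents_proper + init_mask * latents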
@@ -2452,7 +2358,8 @@ def inpaint( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -2464,7 +2371,8 @@ def inpaint( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -2480,12 +2388,13 @@ def inpaint( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -2495,16 +2404,15 @@ def inpaint( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -2524,7 +2432,8 @@ def inpaint( timestep=latent_timestep, is_strength_max=is_strength_max, return_noise=True, - return_image_latents=return_image_latents, ) + return_image_latents=return_image_latents, + ) if return_image_latents: latents, noise, image_latents = latents_outputs @@ -2541,29 +2450,27 @@ def inpaint( dtype=dtype, generator=generator, do_classifier_free_guidance=do_classifier_free_guidance, - return_masked_image_latents=True, ) + return_masked_image_latents=True, + ) # 7. Check that sizes of mask, masked image and latents match if num_channels_unet == 9: # default case for runwayml/stable-diffusion-inpainting num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + - num_channels_masked_image != self.unet.config.in_channels): + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) elif num_channels_unet != 4: - raise ValueError( - f"The unet should have either 4 or 9 input channels, not {num_channels_unet}." 
- ) + raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.") # do_controlnet - do_controlnet = (controlnet_cond is not None and - self.controlnet is not None and is_legacy) + do_controlnet = controlnet_cond is not None and self.controlnet is not None and is_legacy if not do_controlnet: guess_mode = False if do_controlnet: @@ -2576,7 +2483,8 @@ def inpaint( num_images_per_prompt=num_images_per_prompt, dtype=dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None @@ -2584,26 +2492,21 @@ def inpaint( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if not is_legacy: # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = paddle.concat( - [latent_model_input, mask, masked_image_latents], - axis=1) + latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) noise_pred_unet = self.do_unet( do_controlnet, @@ -2616,51 +2519,39 @@ def inpaint( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] if is_legacy: if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents latents = latents.cast(dtype) # 
call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -2671,57 +2562,54 @@ def inpaint( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def check_inputs_hires_fix( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + hr_scale, + hr_resize_height, + hr_resize_width, + denoising_strength, + latent_scale_mode, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError( f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if hr_scale < 0: - raise ValueError( - "hr_scale shoule be greater that 0, but acceived {hr_scale}") + raise ValueError("hr_scale shoule be greater that 0, but acceived {hr_scale}") if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: raise ValueError( @@ -2729,9 +2617,7 @@ def check_inputs_hires_fix( ) if denoising_strength > 1 or denoising_strength < 0: - raise ValueError( - f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}" - ) + raise ValueError(f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -2749,14 +2635,10 @@ def check_inputs_hires_fix( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") - - def get_upscaled_width_and_height(self, - width, - height, - hr_scale=2, - hr_resize_width=0, - hr_resize_height=0): + f" {negative_prompt_embeds.shape}." + ) + + def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): if hr_resize_width == 0 and hr_resize_height == 0: hr_upscale_to_width = int(width * hr_scale) hr_upscale_to_height = int(height * hr_scale) @@ -2783,42 +2665,42 @@ def get_upscaled_width_and_height(self, def get_hires_fix_timesteps(self, denoising_steps, denoising_strength): steps = int(denoising_steps / min(denoising_strength, 0.999)) self.scheduler.set_timesteps(steps) - timesteps = self.scheduler.timesteps[steps - denoising_steps:] + timesteps = self.scheduler.timesteps[steps - denoising_steps :] return timesteps, denoising_steps @paddle.no_grad() def hires_fix( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=40, - hires_ratio: Optional[float]=0.5, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - enable_hr: Optional[bool]=True, - hr_scale: Optional[float]=2.0, - hr_resize_width: Optional[int]=0, - hr_resize_height: Optional[int]=0, - denoising_strength: Optional[float]=0.7, - latent_scale_mode: Optional[str]="nearest", - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 40, + hires_ratio: 
Optional[float] = 0.5, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + enable_hr: Optional[bool] = True, + hr_scale: Optional[float] = 2.0, + hr_resize_width: Optional[int] = 0, + hr_resize_height: Optional[int] = 0, + denoising_strength: Optional[float] = 0.7, + latent_scale_mode: Optional[str] = "nearest", + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -2942,7 +2824,8 @@ def hires_fix( latent_scale_mode, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -2958,12 +2841,13 @@ def hires_fix( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -2973,7 +2857,8 @@ def hires_fix( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -2990,7 +2875,8 @@ def hires_fix( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None @@ -3009,11 +2895,9 @@ def hires_fix( # 5. Prepare latent variables if generator is None: generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - generator_state) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - paddle.Generator().states_[generator]) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -3021,7 +2905,8 @@ def hires_fix( width, generator=generator, dtype=dtype, - latents=latents, ) + latents=latents, + ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -3032,10 +2917,8 @@ def hires_fix( with self.progress_bar(total=sample_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -3048,25 +2931,21 @@ def hires_fix( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -3076,19 +2955,16 @@ def hires_fix( # 8. determine the upscaled width and height for upscaled images truncate_width = 0 truncate_height = 0 - ( - hr_upscale_to_width, - hr_upscale_to_height, ) = self.get_upscaled_width_and_height( - width, - height, - hr_scale=hr_scale, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, ) + (hr_upscale_to_width, hr_upscale_to_height,) = self.get_upscaled_width_and_height( + width, + height, + hr_scale=hr_scale, + hr_resize_width=hr_resize_width, + hr_resize_height=hr_resize_height, + ) if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (hr_upscale_to_width - hr_resize_width - ) // self.vae_scale_factor - truncate_height = (hr_upscale_to_height - hr_resize_height - ) // self.vae_scale_factor + truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor + truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor # 9. 
special case: do nothing if upscaling is not nesscessary if hr_upscale_to_width == width and hr_upscale_to_height == height: @@ -3097,10 +2973,7 @@ def hires_fix( if enable_hr: if do_controlnet: - ( - control_image, - control_conditioning_scale, - ) = self.prepare_controlnet_cond( + (control_image, control_conditioning_scale,) = self.prepare_controlnet_cond( controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, width=hr_upscale_to_width, @@ -3109,45 +2982,43 @@ def hires_fix( num_images_per_prompt=num_images_per_prompt, dtype=dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None # 10. prepare init latents - timesteps, hr_steps = self.get_hires_fix_timesteps( - hr_steps, denoising_strength) + timesteps, hr_steps = self.get_hires_fix_timesteps(hr_steps, denoising_strength) init_timestep = timesteps[:1].tile([latents.shape[0]]) latents = paddle.nn.functional.interpolate( latents, size=( hr_upscale_to_height // self.vae_scale_factor, - hr_upscale_to_width // self.vae_scale_factor, ), - mode=latent_scale_mode, ) - latents = latents[:, :, truncate_height // 2:latents.shape[2] - ( - truncate_height + 1) // 2, truncate_width // 2:latents.shape[3] - - (truncate_width + 1) // 2, ] - - noise = randn_tensor( - latents.shape, - dtype=latents.dtype, - generator="initial_generator") + hr_upscale_to_width // self.vae_scale_factor, + ), + mode=latent_scale_mode, + ) + latents = latents[ + :, + :, + truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, + truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, + ] + + noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") latents = self.scheduler.add_noise(latents, noise, init_timestep) # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs( - "initial_generator", eta) + extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) # 12. 
denoising on hires.fix steps num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order with self.progress_bar(total=hr_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else - latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -3160,31 +3031,26 @@ def hires_fix( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -3195,42 +3061,40 @@ def hires_fix( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def cycle_diffusion( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - 
cross_attention_kwargs: Optional[Dict[str, Any]]=None, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -3310,8 +3174,7 @@ def cycle_diffusion( """ self.change_scheduler("ddim") # 0. Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs @@ -3323,7 +3186,8 @@ def cycle_diffusion( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -3339,8 +3203,9 @@ def cycle_diffusion( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode target prompt and source prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, @@ -3351,24 +3216,24 @@ def cycle_diffusion( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) source_prompt_embeds = self._encode_prompt( source_prompt, num_images_per_prompt, do_classifier_free_guidance, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 6. Prepare latent variables # at which timestep to set the initial noise (n.b. 
50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) is_strength_max = strength == 1.0 latents, clean_latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -3380,7 +3245,8 @@ def cycle_diffusion( image=init_image, timestep=latent_timestep, is_strength_max=is_strength_max, - return_image_latents=True, ) + return_image_latents=True, + ) source_latents = latents # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -3388,18 +3254,15 @@ def cycle_diffusion( generator = extra_step_kwargs.pop("generator", None) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = paddle.concat([latents] * 2) source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input( - source_latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) # predict the noise residual concat_latent_model_input = paddle.stack( @@ -3409,7 +3272,8 @@ def cycle_diffusion( source_latent_model_input[1], latent_model_input[1], ], - axis=0, ) + axis=0, + ) concat_prompt_embeds = paddle.stack( [ source_prompt_embeds[0], @@ -3417,7 +3281,8 @@ def cycle_diffusion( source_prompt_embeds[1], prompt_embeds[1], ], - axis=0, ) + axis=0, + ) # predict the noise residual concat_noise_pred = self.unet( @@ -3425,19 +3290,20 @@ def cycle_diffusion( timestep=t, encoder_hidden_states=concat_prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance ( source_noise_pred_uncond, noise_pred_uncond, source_noise_pred_text, - noise_pred_text, ) = concat_noise_pred.chunk( - 4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond) + source_noise_pred_text - source_noise_pred_uncond + ) # Sample source_latents from the posterior distribution. prev_source_latents = posterior_sample( @@ -3446,7 +3312,8 @@ def cycle_diffusion( t, clean_latents, generator=generator, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) # Compute noise. 
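The CycleDiffusion loop above runs the source and target branches through one batched UNet call: four rows are stacked in the order source-uncond, target-uncond, source-cond, target-cond, then `chunk(4)` splits the prediction so each branch gets its own guidance scale. A numpy stand-in for that pack/unpack pattern (shapes are illustrative):

```python
import numpy as np

def pack_four(source_latents, latents, source_embeds, embeds):
    # each input has 2 rows: index 0 = unconditional, index 1 = conditional
    packed_latents = np.stack([source_latents[0], latents[0], source_latents[1], latents[1]])
    packed_embeds = np.stack([source_embeds[0], embeds[0], source_embeds[1], embeds[1]])
    return packed_latents, packed_embeds

def unpack_four(concat_noise_pred, guidance_scale, source_guidance_scale):
    src_uncond, uncond, src_text, text = np.split(concat_noise_pred, 4, axis=0)
    noise_pred = uncond + guidance_scale * (text - uncond)
    source_noise_pred = src_uncond + source_guidance_scale * (src_text - src_uncond)
    return noise_pred, source_noise_pred

pred = np.random.randn(4, 4, 64, 64).astype("float32")
noise_pred, source_noise_pred = unpack_four(pred, 7.5, 1.0)
print(noise_pred.shape, source_noise_pred.shape)  # (1, 4, 64, 64) (1, 4, 64, 64)
```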
noise = compute_noise( self.scheduler, @@ -3454,29 +3321,24 @@ def cycle_diffusion( source_latents, t, source_noise_pred, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) source_latents = prev_source_latents.cast(dtype) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, - t, - latents, - variance_noise=noise, - **extra_step_kwargs).prev_sample + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -3487,11 +3349,9 @@ def cycle_diffusion( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/webui_stable_diffusion.py b/ppdiffusers/examples/community/webui_stable_diffusion.py index cad5739c1f3c5..c5c7cd4c8c0a9 100644 --- a/ppdiffusers/examples/community/webui_stable_diffusion.py +++ b/ppdiffusers/examples/community/webui_stable_diffusion.py @@ -25,22 +25,27 @@ import paddle.nn as nn import PIL import PIL.Image -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.models import (AutoencoderKL, ControlNetModel, - UNet2DConditionModel) +from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ppdiffusers.models.controlnet import ControlNetOutput from ppdiffusers.models.modeling_utils import ModelMixin from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (PIL_INTERPOLATION, PPDIFFUSERS_CACHE, logging, - ppdiffusers_url_download, randn_tensor, - safetensors_load, smart_load, torch_load) +from ppdiffusers.utils import ( + PIL_INTERPOLATION, + PPDIFFUSERS_CACHE, + logging, + ppdiffusers_url_download, + randn_tensor, + safetensors_load, + smart_load, + torch_load, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -89,8 +94,7 @@ def resize(im, w, h): resized = resize(im, src_w, src_h) res = Image.new("RGB", (width, 
height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) else: ratio = width / height @@ -101,31 +105,22 @@ def resize(im, w, h): resized = resize(im, src_w, src_h) res = Image.new("RGB", (width, height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) if ratio < src_ratio: fill_height = height // 2 - src_h // 2 + res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) res.paste( - resized.resize( - (width, fill_height), box=(0, 0, width, 0)), - box=(0, 0)) - res.paste( - resized.resize( - (width, fill_height), - box=(0, resized.height, width, resized.height)), - box=(0, fill_height + src_h), ) + resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), + box=(0, fill_height + src_h), + ) elif ratio > src_ratio: fill_width = width // 2 - src_w // 2 + res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) res.paste( - resized.resize( - (fill_width, height), box=(0, 0, 0, height)), - box=(0, 0)) - res.paste( - resized.resize( - (fill_width, height), - box=(resized.width, 0, resized.width, height)), - box=(fill_width + src_w, 0), ) + resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), + box=(fill_width + src_w, 0), + ) return res @@ -137,8 +132,7 @@ def get_civitai_download_url(display_url, url_prefix="https://civitai.com"): import requests headers = { - "User-Agent": - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE" + "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE" } r = requests.get(display_url, headers=headers) soup = bs4.BeautifulSoup(r.text, "lxml") @@ -151,12 +145,13 @@ def get_civitai_download_url(display_url, url_prefix="https://civitai.com"): def http_file_name( - url: str, - *, - proxies=None, - headers: Optional[Dict[str, str]]=None, - timeout=10.0, - max_retries=0, ): + url: str, + *, + proxies=None, + headers: Optional[Dict[str, str]] = None, + timeout=10.0, + max_retries=0, +): """ Get a remote file name. 
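The two `resize_image` branches shown above differ only in which scale factor wins: crop-and-resize scales by the larger ratio and centre-crops the overflow, while resize-and-fill scales by the smaller ratio and pads the leftover bands by stretching the edge pixels. A sketch of just the size arithmetic, under that reading of the branches (function names are mine; the PIL pasting is omitted):

```python
def crop_and_resize_size(width, height, im_w, im_h):
    # scale so the image covers the target; the centred paste then crops the overflow
    ratio, src_ratio = width / height, im_w / im_h
    src_w = width if ratio > src_ratio else im_w * height // im_h
    src_h = height if ratio <= src_ratio else im_h * width // im_w
    return src_w, src_h

def resize_and_fill_size(width, height, im_w, im_h):
    # scale so the image fits inside the target; leftover bands are filled from the edges
    ratio, src_ratio = width / height, im_w / im_h
    src_w = width if ratio < src_ratio else im_w * height // im_h
    src_h = height if ratio >= src_ratio else im_h * width // im_w
    return src_w, src_h

print(crop_and_resize_size(512, 512, 640, 480))   # (682, 512): 170 columns get cropped away
print(resize_and_fill_size(512, 512, 640, 480))   # (512, 384): two 64px bands get filled
```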
""" @@ -168,7 +163,8 @@ def http_file_name( proxies=proxies, headers=headers, timeout=timeout, - max_retries=max_retries, ) + max_retries=max_retries, + ) hf_raise_for_status(r) displayed_name = url.split("/")[-1] content_disposition = r.headers.get("Content-Disposition") @@ -180,11 +176,12 @@ def http_file_name( @paddle.no_grad() def load_lora( - pipeline, - state_dict: dict, - LORA_PREFIX_UNET: str="lora_unet", - LORA_PREFIX_TEXT_ENCODER: str="lora_te", - ratio: float=1.0, ): + pipeline, + state_dict: dict, + LORA_PREFIX_UNET: str = "lora_unet", + LORA_PREFIX_TEXT_ENCODER: str = "lora_te", + ratio: float = 1.0, +): ratio = float(ratio) visited = [] for key in state_dict: @@ -192,8 +189,7 @@ def load_lora( continue if "text" in key: - tmp_layer_infos = (key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER - + "_")[-1].split("_")) + tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") hf_to_ppnlp = { "encoder": "transformer", "fc1": "linear1", @@ -206,8 +202,7 @@ def load_lora( layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info)) curr_layer: paddle.nn.Linear = pipeline.text_encoder else: - layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[ - -1].split("_") + layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_") curr_layer: paddle.nn.Linear = pipeline.unet temp_name = layer_infos.pop(0) @@ -248,24 +243,29 @@ def load_lora( if weight_down.shape[2:4] == [1, 1]: # conv2d 1x1 curr_layer.weight.copy_( - curr_layer.weight + ratio * paddle.matmul( - weight_up.squeeze([-1, -2]), - weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) * - scale, - True, ) + curr_layer.weight + + ratio + * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) + * scale, + True, + ) else: # conv2d 3x3 curr_layer.weight.copy_( - curr_layer.weight + ratio * paddle.nn.functional.conv2d( - weight_down.transpose([1, 0, 2, 3]), - weight_up).transpose([1, 0, 2, 3]) * scale, - True, ) + curr_layer.weight + + ratio + * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose( + [1, 0, 2, 3] + ) + * scale, + True, + ) else: # linear curr_layer.weight.copy_( - curr_layer.weight + ratio * paddle.matmul( - weight_up, weight_down).T * scale, - True, ) + curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, + True, + ) # update visited list visited.extend(triplet_keys) @@ -285,28 +285,25 @@ class MultiControlNetModel(ModelMixin): `ControlNetModel` as a list. 
""" - def __init__( - self, - controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): + def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): super().__init__() self.nets = nn.LayerList(controlnets) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: List[paddle.Tensor], - conditioning_scale: List[float], - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - guess_mode: bool=False, - return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]: - for i, ( - image, scale, controlnet - ) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + controlnet_cond: List[paddle.Tensor], + conditioning_scale: List[float], + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: + for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): down_samples, mid_sample = controlnet( sample, timestep, @@ -318,7 +315,8 @@ def forward( attention_mask, cross_attention_kwargs, guess_mode, - return_dict, ) + return_dict, + ) # merge samples if i == 0: @@ -326,8 +324,7 @@ def forward( else: down_block_res_samples = [ samples_prev + samples_curr - for samples_prev, samples_curr in zip( - down_block_res_samples, down_samples) + for samples_prev, samples_curr in zip(down_block_res_samples, down_samples) ] mid_block_res_sample += mid_sample @@ -373,17 +370,22 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline): TI_DIR = os.path.join(PPDIFFUSERS_CACHE, "textual_inversion") def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ - ControlNetModel], MultiControlNetModel, ]=None, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + controlnet: Union[ + ControlNetModel, + List[ControlNetModel], + Tuple[ControlNetModel], + MultiControlNetModel, + ] = None, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -413,8 +415,9 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # custom data @@ -441,9 +444,9 @@ def __init__( # register_state_dict_hook to fix text_encoder, when we save_pretrained text model. 
def map_to(state_dict, *args, **kwargs): if "text_model.token_embedding.wrapped.weight" in state_dict: - state_dict[ - "text_model.token_embedding.weight"] = state_dict.pop( - "text_model.token_embedding.wrapped.weight") + state_dict["text_model.token_embedding.weight"] = state_dict.pop( + "text_model.token_embedding.wrapped.weight" + ) return state_dict self.text_encoder.register_state_dict_hook(map_to) @@ -466,7 +469,8 @@ def download_civitai_lora_file(self, url): file_path = ppdiffusers_url_download( download_url, cache_dir=self.LORA_DIR, - filename=http_file_name(download_url).strip('"'), ) + filename=http_file_name(download_url).strip('"'), + ) return file_path def download_civitai_ti_file(self, url): @@ -479,7 +483,8 @@ def download_civitai_ti_file(self, url): file_path = ppdiffusers_url_download( download_url, cache_dir=self.TI_DIR, - filename=http_file_name(download_url).strip('"'), ) + filename=http_file_name(download_url).strip('"'), + ) return file_path def change_scheduler(self, scheduler_type="ddim"): @@ -488,55 +493,56 @@ def change_scheduler(self, scheduler_type="ddim"): def switch_scheduler(self, scheduler_type="ddim"): scheduler_type = scheduler_type.lower() from ppdiffusers import ( - DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UniPCMultistepScheduler) + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UniPCMultistepScheduler, + ) if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif 
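Returning to the `load_lora` hunk above: the LoRA update is folded directly into the base weights, so for a linear layer the merge is `W += ratio * (up @ down).T * scale`, with the transpose accounting for paddle's `[in_features, out_features]` weight layout. A toy numpy version of the linear case; the `alpha / rank` scale is an assumption, since that part of the function is not shown in the hunk:

```python
import numpy as np

def merge_lora_linear(weight, weight_up, weight_down, ratio=1.0, alpha=None):
    # weight:      [in_features, out_features]  (paddle.nn.Linear layout)
    # weight_up:   [out_features, rank]
    # weight_down: [rank, in_features]
    rank = weight_up.shape[1]
    scale = (alpha / rank) if alpha is not None else 1.0   # assumed convention
    return weight + ratio * (weight_up @ weight_down).T * scale

W = np.zeros((768, 768), dtype="float32")
up = np.random.randn(768, 4).astype("float32") * 0.01
down = np.random.randn(4, 768).astype("float32") * 0.01
print(merge_lora_linear(W, up, down, ratio=0.8, alpha=4).shape)  # (768, 768)
```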
scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, - ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError( f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" @@ -545,30 +551,28 @@ def switch_scheduler(self, scheduler_type="ddim"): @paddle.no_grad() def _encode_prompt( - self, - prompt: str, - do_classifier_free_guidance: float=7.5, - negative_prompt: str=None, - num_inference_steps: int=50, ): + self, + prompt: str, + do_classifier_free_guidance: float = 7.5, + negative_prompt: str = None, + num_inference_steps: int = 50, + ): if do_classifier_free_guidance: assert isinstance(negative_prompt, str) negative_prompt = [negative_prompt] - uc = get_learned_conditioning(self.sj.clip, negative_prompt, - num_inference_steps) + uc = get_learned_conditioning(self.sj.clip, negative_prompt, num_inference_steps) else: uc = None - c = get_multicond_learned_conditioning(self.sj.clip, prompt, - num_inference_steps) + c = get_multicond_learned_conditioning(self.sj.clip, prompt, num_inference_steps) return c, uc def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -587,48 +591,43 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - negative_prompt=None, - controlnet_conditioning_scale=1.0, ): + self, + prompt, + image, + height, + width, + callback_steps, + negative_prompt=None, + controlnet_conditioning_scale=1.0, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` 
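`prepare_extra_step_kwargs` above uses `inspect.signature` so that `eta` and `generator` are only forwarded to schedulers whose `step()` actually accepts them (DDIM takes `eta`; most others do not). The pattern in isolation, with toy step functions standing in for real schedulers:

```python
import inspect

def build_extra_step_kwargs(scheduler_step_fn, eta=0.0, generator=None):
    params = set(inspect.signature(scheduler_step_fn).parameters.keys())
    extra = {}
    if "eta" in params:
        extra["eta"] = eta
    if "generator" in params:
        extra["generator"] = generator
    return extra

# toy step functions standing in for scheduler.step
def ddim_like_step(model_output, timestep, sample, eta=0.0, generator=None):
    return sample

def pndm_like_step(model_output, timestep, sample):
    return sample

print(build_extra_step_kwargs(ddim_like_step, eta=0.1))  # {'eta': 0.1, 'generator': None}
print(build_extra_step_kwargs(pndm_like_step, eta=0.1))  # {}
```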
have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and not isinstance(prompt, str): - raise ValueError( - f"`prompt` has to be of type `str` but is {type(prompt)}") + raise ValueError(f"`prompt` has to be of type `str` but is {type(prompt)}") if negative_prompt is not None and not isinstance(negative_prompt, str): - raise ValueError( - f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}" - ) + raise ValueError(f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}") # `prompt` needs more sophisticated handling when there are multiple # conditionings. @@ -645,15 +644,12 @@ def check_inputs( self.check_image(image, prompt) elif isinstance(self.controlnet, MultiControlNetModel): if not isinstance(image, list): - raise TypeError( - "For multiple controlnets: `image` must be type `list`") + raise TypeError("For multiple controlnets: `image` must be type `list`") # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." - ) + raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( "For multiple controlnets: `image` must have the same length as the number of controlnets." @@ -666,39 +662,31 @@ def check_inputs( # Check `controlnet_conditioning_scale` if isinstance(self.controlnet, ControlNetModel): - if not isinstance(controlnet_conditioning_scale, - (float, list, tuple)): + if not isinstance(controlnet_conditioning_scale, (float, list, tuple)): raise TypeError( "For single controlnet: `controlnet_conditioning_scale` must be type `float, list(float) or tuple(float)`." ) elif isinstance(self.controlnet, MultiControlNetModel): if isinstance(controlnet_conditioning_scale, list): - if any( - isinstance(i, list) - for i in controlnet_conditioning_scale): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." 
- ) - elif isinstance( - controlnet_conditioning_scale, - list) and len(controlnet_conditioning_scale) != len( - self.controlnet.nets): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): raise ValueError( "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets") + " the same length as the number of controlnets" + ) else: assert False def check_image(self, image, prompt): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, paddle.Tensor) - image_is_pil_list = isinstance(image, list) and isinstance( - image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance( - image[0], paddle.Tensor) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - if (not image_is_pil and not image_is_tensor and - not image_is_pil_list and not image_is_tensor_list): + if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: raise TypeError( "image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors" ) @@ -725,27 +713,16 @@ def check_image(self, image, prompt): def prepare_image(self, image, width, height, dtype, resize_mode=-1): if not isinstance(image, paddle.Tensor): if isinstance(image, PIL.Image.Image): - image = resize_image( - resize_mode=resize_mode, - im=image, - width=width, - height=height) + image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height) image = [image] if isinstance(image[0], PIL.Image.Image): - image = [ - resize_image( - resize_mode=resize_mode, - im=im, - width=width, - height=height) for im in image - ] + image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image] images = [] for image_ in image: image_ = image_.convert("RGB") - image_ = image_.resize( - (width, height), resample=PIL_INTERPOLATION["lanczos"]) + image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) image_ = np.array(image_) image_ = image_[None, :] images.append(image_) @@ -761,14 +738,15 @@ def prepare_image(self, image, width, height, dtype, resize_mode=-1): return image def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -812,31 +790,31 @@ def _default_height_width(self, height, width, image): @paddle.no_grad() def __call__( - self, - prompt: str=None, - image: PIL.Image.Image=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: str=None, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - clip_skip: int=1, - 
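`prepare_image` above converts the ControlNet conditioning input to a fixed-size RGB batch: resize with Lanczos, convert to arrays, and stack. The hunk ends before the final normalisation, so the `/ 255` and NCHW transpose in this sketch are assumptions based on how ControlNet conditioning images are usually prepared:

```python
import numpy as np
from PIL import Image

def prepare_control_image(images, width, height):
    arrays = []
    for im in images:
        im = im.convert("RGB").resize((width, height), resample=Image.LANCZOS)
        arrays.append(np.array(im)[None, :])             # [1, H, W, 3]
    batch = np.concatenate(arrays, axis=0).astype("float32") / 255.0   # assumed scaling
    return batch.transpose(0, 3, 1, 2)                   # [N, 3, H, W], assumed layout

print(prepare_control_image([Image.new("RGB", (640, 480))], 512, 512).shape)  # (1, 3, 512, 512)
```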
controlnet_conditioning_scale: Union[float, List[float]]=1.0, - enable_lora: bool=True, - resize_mode: int=0, - # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] - # 0 1 2 -1 - starting_control_step: float=0.0, - ending_control_step: float=1.0, ): + self, + prompt: str = None, + image: PIL.Image.Image = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: str = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = 1, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + enable_lora: bool = True, + resize_mode: int = 0, + # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] + # 0 1 2 -1 + starting_control_step: float = 0.0, + ending_control_step: float = 1.0, + ): r""" Function invoked when calling the pipeline for generation. @@ -914,17 +892,16 @@ def __call__( # 0. Default height and width to unet if enable_control: if isinstance(self.controlnet, ControlNetModel): - height, width = self._default_height_width(height, width, - image) + height, width = self._default_height_width(height, width, image) image = self.prepare_image( image=image, width=width, height=height, dtype=self.controlnet.dtype, - resize_mode=resize_mode, ) + resize_mode=resize_mode, + ) elif isinstance(self.controlnet, MultiControlNetModel): - height, width = self._default_height_width(height, width, - image) + height, width = self._default_height_width(height, width, image) images = [] for image_ in image: @@ -933,16 +910,15 @@ def __call__( width=width, height=height, dtype=self.controlnet.dtype, - resize_mode=resize_mode, ) + resize_mode=resize_mode, + ) images.append(image_) image = images else: - height = height or max(self.unet.config.sample_size * - self.vae_scale_factor, 512) - width = width or max(self.unet.config.sample_size * - self.vae_scale_factor, 512) + height = height or max(self.unet.config.sample_size * self.vae_scale_factor, 512) + width = width or max(self.unet.config.sample_size * self.vae_scale_factor, 512) # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -952,7 +928,8 @@ def __call__( width, callback_steps, negative_prompt, - controlnet_conditioning_scale, ) + controlnet_conditioning_scale, + ) # 2. 
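When ControlNet is disabled, the default resolution above falls back to `unet.config.sample_size * vae_scale_factor`, floored at 512, where `vae_scale_factor` is `2 ** (len(block_out_channels) - 1)` from the pipeline's `__init__`. For the familiar SD 1.x configuration (values below are the well-known defaults, not read from this repo) that works out to 512:

```python
def default_sample_size(unet_sample_size, vae_block_out_channels, minimum=512):
    # vae_scale_factor is 2 ** (number of VAE downsampling stages)
    vae_scale_factor = 2 ** (len(vae_block_out_channels) - 1)
    return max(unet_sample_size * vae_scale_factor, minimum)

# SD 1.x: latent sample_size=64, VAE block_out_channels=(128, 256, 512, 512) -> 64 * 8 = 512
print(default_sample_size(64, (128, 256, 512, 512)))  # 512
```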
Define call parameters batch_size = 1 @@ -966,47 +943,34 @@ def __call__( if enable_lora and self.LORA_DIR is not None: if os.path.exists(self.LORA_DIR): - lora_mapping = { - p.stem: p.absolute() - for p in Path(self.LORA_DIR).glob("*.safetensors") - } + lora_mapping = {p.stem: p.absolute() for p in Path(self.LORA_DIR).glob("*.safetensors")} for params in extra_network_data["lora"]: assert len(params.items) > 0 name = params.items[0] if name in lora_mapping: - ratio = (float(params.items[1]) - if len(params.items) > 1 else 1.0) - lora_state_dict = smart_load( - lora_mapping[name], - map_location=paddle.get_device()) + ratio = float(params.items[1]) if len(params.items) > 1 else 1.0 + lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device()) self.weights_has_changed = True - load_lora( - self, state_dict=lora_state_dict, ratio=ratio) + load_lora(self, state_dict=lora_state_dict, ratio=ratio) del lora_state_dict else: - print( - f"We can't find lora weight: {name}! Please make sure that exists!" - ) + print(f"We can't find lora weight: {name}! Please make sure that exists!") else: if len(extra_network_data["lora"]) > 0: - print( - f"{self.LORA_DIR} not exists, so we cant load loras!" - ) + print(f"{self.LORA_DIR} not exists, so we cant load loras!") self.sj.clip.CLIP_stop_at_last_layers = clip_skip - if isinstance(self.controlnet, MultiControlNetModel) and isinstance( - controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [ - controlnet_conditioning_scale - ] * len(self.controlnet.nets) + if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets) # 3. Encode input prompt prompt_embeds, negative_prompt_embeds = self._encode_prompt( prompts, do_classifier_free_guidance, negative_prompt, - num_inference_steps=num_inference_steps, ) + num_inference_steps=num_inference_steps, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -1021,127 +985,107 @@ def __call__( width, self.unet.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
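In the `__call__` body above, LoRA entries parsed out of the prompt are resolved against `*.safetensors` files in `LORA_DIR` by file stem, with an optional second item read as the merge ratio (default 1.0). A self-contained version of that lookup; the directory path and request format are illustrative:

```python
from pathlib import Path

def resolve_lora_requests(lora_dir, requests):
    """requests: list of parameter item-lists like ["name"] or ["name", "0.6"],
    as produced by the extra-network prompt parser."""
    mapping = {p.stem: p.absolute() for p in Path(lora_dir).glob("*.safetensors")}
    resolved, missing = [], []
    for items in requests:
        name = items[0]
        ratio = float(items[1]) if len(items) > 1 else 1.0
        if name in mapping:
            resolved.append((mapping[name], ratio))
        else:
            missing.append(name)
    return resolved, missing

# e.g. resolve_lora_requests("/path/to/lora", [["animeLineart", "0.6"], ["not_downloaded"]])
```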
Denoising loop - num_warmup_steps = ( - len(timesteps) - num_inference_steps * self.scheduler.order) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): current_control_step = i / len(timesteps) step = i // self.scheduler.order do_batch = False - conds_list, cond_tensor = reconstruct_multicond_batch( - prompt_embeds, step) + conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step) try: weight = conds_list[0][0][1] except Exception: weight = 1.0 if do_classifier_free_guidance: - uncond_tensor = reconstruct_cond_batch( - negative_prompt_embeds, step) - do_batch = cond_tensor.shape[1] == uncond_tensor.shape[ - 1] and not isinstance(self.controlnet, - MultiControlNetModel) + uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step) + do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1] and not isinstance( + self.controlnet, MultiControlNetModel + ) # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_batch else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_batch else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if do_batch: - encoder_hidden_states = paddle.concat( - [uncond_tensor, cond_tensor]) + encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor]) control_kwargs = {} - if (enable_control and starting_control_step < - current_control_step < ending_control_step): - ( - down_block_res_samples, - mid_block_res_sample, - ) = self.controlnet( + if enable_control and starting_control_step < current_control_step < ending_control_step: + (down_block_res_samples, mid_block_res_sample,) = self.controlnet( latent_model_input, t, encoder_hidden_states=encoder_hidden_states, controlnet_cond=paddle.concat([image, image]), conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) - control_kwargs[ - "down_block_additional_residuals"] = down_block_res_samples - control_kwargs[ - "mid_block_additional_residual"] = mid_block_res_sample + return_dict=False, + ) + control_kwargs["down_block_additional_residuals"] = down_block_res_samples + control_kwargs["mid_block_additional_residual"] = mid_block_res_sample noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, ).sample + **control_kwargs, + ).sample noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + weight * guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text - noise_pred_uncond + ) else: control_kwargs = {} - if (enable_control and starting_control_step < - current_control_step < ending_control_step): - ( - down_block_res_samples, - mid_block_res_sample, - ) = self.controlnet( + if enable_control and starting_control_step < current_control_step < ending_control_step: + (down_block_res_samples, mid_block_res_sample,) = self.controlnet( latent_model_input, t, encoder_hidden_states=cond_tensor, controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) - control_kwargs[ - "down_block_additional_residuals"] = down_block_res_samples - control_kwargs[ - "mid_block_additional_residual"] = mid_block_res_sample + return_dict=False, + ) + 
control_kwargs["down_block_additional_residuals"] = down_block_res_samples + control_kwargs["mid_block_additional_residual"] = mid_block_res_sample noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=cond_tensor, cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, ).sample + **control_kwargs, + ).sample if do_classifier_free_guidance: control_kwargs = {} - if (enable_control and starting_control_step < - current_control_step < ending_control_step): - ( - down_block_res_samples, - mid_block_res_sample, - ) = self.controlnet( + if enable_control and starting_control_step < current_control_step < ending_control_step: + (down_block_res_samples, mid_block_res_sample,) = self.controlnet( latent_model_input, t, encoder_hidden_states=uncond_tensor, controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) - control_kwargs[ - "down_block_additional_residuals"] = down_block_res_samples - control_kwargs[ - "mid_block_additional_residual"] = mid_block_res_sample + return_dict=False, + ) + control_kwargs["down_block_additional_residuals"] = down_block_res_samples + control_kwargs["mid_block_additional_residual"] = mid_block_res_sample noise_pred_uncond = self.unet( latent_model_input, t, encoder_hidden_states=uncond_tensor, cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, ).sample - noise_pred = noise_pred_uncond + weight * guidance_scale * ( - noise_pred - noise_pred_uncond) + **control_kwargs, + ).sample + noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1154,8 +1098,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, self.unet.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -1164,14 +1107,12 @@ def __call__( image = self.decode_latents(latents) # 9. 
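Throughout the denoising loop above, ControlNet residuals are only attached while the normalised step index lies strictly inside `(starting_control_step, ending_control_step)`; outside that window `control_kwargs` stays empty and the UNet ignores the control image. The gate in isolation:

```python
def control_active(i, num_timesteps, starting_control_step=0.0, ending_control_step=1.0, enable_control=True):
    # current_control_step is the fraction of the schedule already consumed
    current_control_step = i / num_timesteps
    return enable_control and starting_control_step < current_control_step < ending_control_step

steps = 20
active = [control_active(i, steps, 0.2, 0.8) for i in range(steps)]
print(active.count(True))  # 11 of 20 steps receive ControlNet residuals
```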
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, self.unet.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) except Exception as e: raise ValueError(e) finally: @@ -1215,12 +1156,7 @@ class FrozenCLIPEmbedder(nn.Layer): LAYERS = ["last", "pooled", "hidden"] - def __init__(self, - text_encoder, - tokenizer, - freeze=True, - layer="last", - layer_idx=None): + def __init__(self, text_encoder, tokenizer, freeze=True, layer="last", layer_idx=None): super().__init__() assert layer in self.LAYERS self.tokenizer = tokenizer @@ -1244,12 +1180,14 @@ def forward(self, text): truncation=True, max_length=self.tokenizer.model_max_length, padding="max_length", - return_tensors="pd", ) + return_tensors="pd", + ) tokens = batch_encoding["input_ids"] outputs = self.text_encoder( input_ids=tokens, output_hidden_states=self.layer == "hidden", - return_dict=True, ) + return_dict=True, + ) if self.layer == "last": z = outputs.last_hidden_state elif self.layer == "pooled": @@ -1288,8 +1226,7 @@ def empty_chunk(self): def get_target_prompt_token_count(self, token_count): """returns the maximum number of tokens a prompt of a known length can have before it requires one more PromptChunk to be represented""" - return math.ceil(max(token_count, 1) / - self.chunk_length) * self.chunk_length + return math.ceil(max(token_count, 1) / self.chunk_length) * self.chunk_length def tokenize(self, texts): """Converts a batch of texts into a batch of token ids""" @@ -1370,10 +1307,12 @@ def next_chunk(is_last=False): # this is when we are at the end of alloted 75 tokens for the current chunk, and the current token is not a comma. opts.comma_padding_backtrack # is a setting that specifies that if there is a comma nearby, the text after the comma should be moved out of this chunk and into the next. 
- elif (WebUIStableDiffusionPipeline.comma_padding_backtrack != 0 - and len(chunk.tokens) == self.chunk_length and - last_comma != -1 and len(chunk.tokens) - last_comma <= - WebUIStableDiffusionPipeline.comma_padding_backtrack): + elif ( + WebUIStableDiffusionPipeline.comma_padding_backtrack != 0 + and len(chunk.tokens) == self.chunk_length + and last_comma != -1 + and len(chunk.tokens) - last_comma <= WebUIStableDiffusionPipeline.comma_padding_backtrack + ): break_location = last_comma + 1 reloc_tokens = chunk.tokens[break_location:] @@ -1392,8 +1331,7 @@ def next_chunk(is_last=False): ( embedding, embedding_length_in_tokens, - ) = self.hijack.embedding_db.find_embedding_at_position( - tokens, position) + ) = self.hijack.embedding_db.find_embedding_at_position(tokens, position) if embedding is None: chunk.tokens.append(token) chunk.multipliers.append(weight) @@ -1455,10 +1393,7 @@ def forward(self, texts): zs = [] for i in range(chunk_count): - batch_chunk = [ - chunks[i] if i < len(chunks) else self.empty_chunk() - for chunks in batch_chunks - ] + batch_chunk = [chunks[i] if i < len(chunks) else self.empty_chunk() for chunks in batch_chunks] tokens = [x.tokens for x in batch_chunk] multipliers = [x.multipliers for x in batch_chunk] @@ -1472,10 +1407,9 @@ def forward(self, texts): zs.append(z) if len(used_embeddings) > 0: - embeddings_list = ", ".join([ - f"{name} [{embedding.checksum()}]" - for name, embedding in used_embeddings.items() - ]) + embeddings_list = ", ".join( + [f"{name} [{embedding.checksum()}]" for name, embedding in used_embeddings.items()] + ) self.hijack.comments.append(f"Used embeddings: {embeddings_list}") return paddle.concat(zs, axis=1) @@ -1494,15 +1428,19 @@ def process_tokens(self, remade_batch_tokens, batch_multipliers): if self.id_end != self.id_pad: for batch_pos in range(len(remade_batch_tokens)): index = remade_batch_tokens[batch_pos].index(self.id_end) - tokens[batch_pos, index + 1:tokens.shape[1]] = self.id_pad + tokens[batch_pos, index + 1 : tokens.shape[1]] = self.id_pad z = self.encode_with_text_encoder(tokens) # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise batch_multipliers = paddle.to_tensor(batch_multipliers) original_mean = z.mean() - z = z * batch_multipliers.reshape(batch_multipliers.shape + - [1, ]).expand(z.shape) + z = z * batch_multipliers.reshape( + batch_multipliers.shape + + [ + 1, + ] + ).expand(z.shape) new_mean = z.mean() z = z * (original_mean / new_mean) @@ -1520,8 +1458,7 @@ def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1): self.comma_token = vocab.get(",", None) self.token_mults = {} - tokens_with_parens = [(k, v) for k, v in vocab.items() - if "(" in k or ")" in k or "[" in k or "]" in k] + tokens_with_parens = [(k, v) for k, v in vocab.items() if "(" in k or ")" in k or "[" in k or "]" in k] for text, ident in tokens_with_parens: mult = 1.0 for c in text: @@ -1542,8 +1479,7 @@ def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1): self.id_pad = self.id_end def tokenize(self, texts): - tokenized = self.wrapped.tokenizer( - texts, truncation=False, add_special_tokens=False)["input_ids"] + tokenized = self.wrapped.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"] return tokenized @@ -1552,7 +1488,8 @@ def encode_with_text_encoder(self, tokens): outputs = self.wrapped.text_encoder( input_ids=tokens, output_hidden_states=output_hidden_states, - return_dict=True, ) + return_dict=True, + ) if output_hidden_states: z = 
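`process_tokens` above applies the per-token attention multipliers (from `(word:1.3)`-style emphasis) to the CLIP hidden states and then rescales so the global mean of the embedding is unchanged, which the original comment describes as a heuristic against artifacts. A numpy sketch of that weighting:

```python
import numpy as np

def apply_multipliers(z, multipliers):
    # z: [batch, tokens, dim] hidden states; multipliers: [batch, tokens]
    original_mean = z.mean()
    z = z * multipliers[..., None]            # emphasise / de-emphasise individual tokens
    z = z * (original_mean / z.mean())        # restore the original global mean
    return z

z = np.random.randn(1, 77, 768).astype("float32")
mult = np.ones((1, 77), dtype="float32")
mult[0, 5] = 1.3                              # e.g. "(castle:1.3)" raised one token's weight
out = apply_multipliers(z, mult)
print(float(z.mean()), float(out.mean()))     # the global mean is preserved
```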
outputs.hidden_states[-self.CLIP_stop_at_last_layers] @@ -1564,11 +1501,9 @@ def encode_with_text_encoder(self, tokens): def encode_embedding_init_text(self, init_text, nvpt): embedding_layer = self.wrapped.text_encoder.text_model - ids = self.wrapped.tokenizer( - init_text, - max_length=nvpt, - return_tensors="pd", - add_special_tokens=False)["input_ids"] + ids = self.wrapped.tokenizer(init_text, max_length=nvpt, return_tensors="pd", add_special_tokens=False)[ + "input_ids" + ] embedded = embedding_layer.token_embedding.wrapped(ids).squeeze(0) return embedded @@ -1630,8 +1565,7 @@ def parse_prompts(prompts): class EmbeddingDecoder(json.JSONDecoder): def __init__(self, *args, **kwargs): - json.JSONDecoder.__init__( - self, object_hook=self.object_hook, *args, **kwargs) + json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) def object_hook(self, d): if "TORCHTENSOR" in d: @@ -1652,8 +1586,7 @@ def lcg(m=2**32, a=1664525, c=1013904223, seed=0): def xor_block(block): g = lcg() - randblock = (np.array([next(g) for _ in range(np.product(block.shape))]) - .astype(np.uint8).reshape(block.shape)) + randblock = np.array([next(g) for _ in range(np.product(block.shape))]).astype(np.uint8).reshape(block.shape) return np.bitwise_xor(block.astype(np.uint8), randblock & 0x0F) @@ -1667,16 +1600,17 @@ def crop_black(img, tol=0): def extract_image_data_embed(image): d = 3 - outarr = (crop_black( - np.array(image.convert("RGB").getdata()) - .reshape(image.size[1], image.size[0], d).astype(np.uint8)) & 0x0F) + outarr = ( + crop_black(np.array(image.convert("RGB").getdata()).reshape(image.size[1], image.size[0], d).astype(np.uint8)) + & 0x0F + ) black_cols = np.where(np.sum(outarr, axis=(0, 2)) == 0) if black_cols[0].shape[0] < 2: print("No Image data blocks found.") return None - data_block_lower = outarr[:, :black_cols[0].min(), :].astype(np.uint8) - data_block_upper = outarr[:, black_cols[0].max() + 1:, :].astype(np.uint8) + data_block_lower = outarr[:, : black_cols[0].min(), :].astype(np.uint8) + data_block_upper = outarr[:, black_cols[0].max() + 1 :, :].astype(np.uint8) data_block_lower = xor_block(data_block_lower) data_block_upper = xor_block(data_block_upper) @@ -1703,7 +1637,8 @@ def extract_image_data_embed(image): # [75, 'fantasy landscape with a lake and an oak in background masterful'] # [100, 'fantasy landscape with a lake and a christmas tree in background masterful'] -schedule_parser = lark.Lark(r""" +schedule_parser = lark.Lark( + r""" !start: (prompt | /[][():]/+)* prompt: (emphasized | scheduled | alternate | plain | WHITESPACE)* !emphasized: "(" prompt ")" @@ -1714,7 +1649,8 @@ def extract_image_data_embed(image): WHITESPACE: /\s+/ plain: /([^\\\[\]():|]|\\.)+/ %import common.SIGNED_NUMBER -> NUMBER -""") +""" +) def get_learned_conditioning_prompt_schedules(prompts, steps): @@ -1806,8 +1742,7 @@ def get_schedule(prompt): return [promptdict[prompt] for prompt in prompts] -ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", - ["end_at_step", "cond"]) +ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"]) def get_learned_conditioning(model, prompts, steps): @@ -1845,8 +1780,7 @@ def get_learned_conditioning(model, prompts, steps): cond_schedule = [] for i, (end_at_step, text) in enumerate(prompt_schedule): - cond_schedule.append( - ScheduledPromptConditioning(end_at_step, conds[i])) + cond_schedule.append(ScheduledPromptConditioning(end_at_step, conds[i])) cache[prompt] = cond_schedule 
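The `lcg`/`xor_block` helpers above implement the light obfuscation used when embedding data is hidden in the low nibbles of a preview image: a fixed-seed linear congruential generator produces a repeatable byte stream, and XOR-ing with it is its own inverse, so one function both encodes and decodes. The generator body is not visible in the hunk, so the `% 255` stream below is an assumption:

```python
import numpy as np

def lcg(m=2**32, a=1664525, c=1013904223, seed=0):
    while True:
        seed = (a * seed + c) % m
        yield seed % 255                      # assumed byte stream

def xor_block(block):
    g = lcg()
    randblock = np.array([next(g) for _ in range(block.size)], dtype=np.uint8).reshape(block.shape)
    return np.bitwise_xor(block.astype(np.uint8), randblock & 0x0F)

payload = np.random.randint(0, 16, size=(4, 4), dtype=np.uint8)    # low-nibble data block
print(np.array_equal(xor_block(xor_block(payload)), payload))      # True: XOR-ing twice restores the data
```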
res.append(cond_schedule) @@ -1871,8 +1805,7 @@ def get_multicond_prompt_list(prompts): for subprompt in subprompts: match = re_weight.search(subprompt) - text, weight = match.groups() if match is not None else (subprompt, - 1.0) + text, weight = match.groups() if match is not None else (subprompt, 1.0) weight = float(weight) if weight is not None else 1.0 @@ -1897,43 +1830,37 @@ def __init__(self, schedules, weight=1.0): class MulticondLearnedConditioning: def __init__(self, shape, batch): - self.shape: tuple = ( - shape # the shape field is needed to send this object to DDIM/PLMS - ) + self.shape: tuple = shape # the shape field is needed to send this object to DDIM/PLMS self.batch: List[List[ComposableScheduledPromptConditioning]] = batch -def get_multicond_learned_conditioning(model, prompts, - steps) -> MulticondLearnedConditioning: +def get_multicond_learned_conditioning(model, prompts, steps) -> MulticondLearnedConditioning: """same as get_learned_conditioning, but returns a list of ScheduledPromptConditioning along with the weight objects for each prompt. For each prompt, the list is obtained by splitting the prompt using the AND separator. https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/ """ - res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list( - prompts) + res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list(prompts) - learned_conditioning = get_learned_conditioning(model, prompt_flat_list, - steps) + learned_conditioning = get_learned_conditioning(model, prompt_flat_list, steps) res = [] for indexes in res_indexes: - res.append([ - ComposableScheduledPromptConditioning(learned_conditioning[i], - weight) - for i, weight in indexes - ]) + res.append([ComposableScheduledPromptConditioning(learned_conditioning[i], weight) for i, weight in indexes]) - return MulticondLearnedConditioning(shape=(len(prompts), ), batch=res) + return MulticondLearnedConditioning(shape=(len(prompts),), batch=res) -def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], - current_step): +def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], current_step): param = c[0][0].cond res = paddle.zeros( - [len(c), ] + param.shape, - dtype=param.dtype, ) + [ + len(c), + ] + + param.shape, + dtype=param.dtype, + ) for i, cond_schedule in enumerate(c): target_index = 0 for current, (end_at, cond) in enumerate(cond_schedule): @@ -1956,8 +1883,7 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): for cond_index, composable_prompt in enumerate(composable_prompts): target_index = 0 - for current, (end_at, - cond) in enumerate(composable_prompt.schedules): + for current, (end_at, cond) in enumerate(composable_prompt.schedules): if current_step <= end_at: target_index = current break @@ -1973,10 +1899,8 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): for i in range(len(tensors)): if tensors[i].shape[0] != token_count: last_vector = tensors[i][-1:] - last_vector_repeated = last_vector.tile( - [token_count - tensors[i].shape[0], 1]) - tensors[i] = paddle.concat( - [tensors[i], last_vector_repeated], axis=0) + last_vector_repeated = last_vector.tile([token_count - tensors[i].shape[0], 1]) + tensors[i] = paddle.concat([tensors[i], last_vector_repeated], axis=0) return conds_list, paddle.stack(tensors).cast(dtype=param.dtype) @@ -1997,7 +1921,8 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): 
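`reconstruct_cond_batch` and `reconstruct_multicond_batch` above select, per prompt, the first `ScheduledPromptConditioning` whose `end_at_step` has not yet passed, and pad conditionings of different token lengths by repeating the final vector so everything stacks into one batch. A simplified numpy version of both ideas:

```python
from collections import namedtuple
import numpy as np

ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"])

def pick_cond(schedule, current_step):
    # first entry whose end_at_step has not passed yet (mirrors the target_index loop)
    target_index = 0
    for current, entry in enumerate(schedule):
        if current_step <= entry.end_at_step:
            target_index = current
            break
    return schedule[target_index].cond

def pad_to_token_count(tensors):
    token_count = max(t.shape[0] for t in tensors)
    padded = []
    for t in tensors:
        if t.shape[0] != token_count:
            last = np.tile(t[-1:], (token_count - t.shape[0], 1))
            t = np.concatenate([t, last], axis=0)
        padded.append(t)
    return np.stack(padded)

schedule = [ScheduledPromptConditioning(10, np.zeros((77, 768))),
            ScheduledPromptConditioning(20, np.ones((77, 768)))]
print(pick_cond(schedule, 5).mean(), pick_cond(schedule, 15).mean())            # 0.0 1.0
print(pad_to_token_count([np.zeros((77, 768)), np.zeros((154, 768))]).shape)    # (2, 154, 768)
```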
[^\\()\[\]:]+| : """, - re.X, ) + re.X, +) re_break = re.compile(r"\s*\bBREAK\b\s*", re.S) @@ -2102,15 +2027,12 @@ class StableDiffusionModelHijack: layers = None circular_enabled = False - def __init__(self, - clip_model, - embeddings_dir=None, - CLIP_stop_at_last_layers=-1): + def __init__(self, clip_model, embeddings_dir=None, CLIP_stop_at_last_layers=-1): model_embeddings = clip_model.text_encoder.text_model - model_embeddings.token_embedding = EmbeddingsWithFixes( - model_embeddings.token_embedding, self) + model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self) clip_model = FrozenCLIPEmbedderWithCustomWords( - clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers) + clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers + ) self.embedding_db = EmbeddingDatabase(clip_model) self.embedding_db.add_embedding_dir(embeddings_dir) @@ -2148,8 +2070,7 @@ def forward(self, input_ids): inputs_embeds = self.wrapped(input_ids) - if (batch_fixes is None or len(batch_fixes) == 0 or - max([len(x) for x in batch_fixes]) == 0): + if batch_fixes is None or len(batch_fixes) == 0 or max([len(x) for x in batch_fixes]) == 0: return inputs_embeds vecs = [] @@ -2157,11 +2078,13 @@ def forward(self, input_ids): for offset, embedding in fixes: emb = embedding.vec.cast(self.wrapped.dtype) emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0]) - tensor = paddle.concat([ - tensor[0:offset + 1], - emb[0:emb_len], - tensor[offset + 1 + emb_len:], - ]) + tensor = paddle.concat( + [ + tensor[0 : offset + 1], + emb[0:emb_len], + tensor[offset + 1 + emb_len :], + ] + ) vecs.append(tensor) @@ -2190,12 +2113,8 @@ def __init__(self, vec, name, step=None): def save(self, filename): embedding_data = { - "string_to_token": { - "*": 265 - }, - "string_to_param": { - "*": self.vec - }, + "string_to_token": {"*": 265}, + "string_to_param": {"*": self.vec}, "name": self.name, "step": self.step, "sd_checkpoint": self.sd_checkpoint, @@ -2267,7 +2186,8 @@ def register_embedding(self, embedding, model): self.ids_lookup[first_id] = sorted( self.ids_lookup[first_id] + [(ids, embedding)], key=lambda x: len(x[0]), - reverse=True, ) + reverse=True, + ) return embedding @@ -2285,8 +2205,7 @@ def load_from_file(self, path, filename): return embed_image = Image.open(path) - if hasattr(embed_image, - "text") and "sd-ti-embedding" in embed_image.text: + if hasattr(embed_image, "text") and "sd-ti-embedding" in embed_image.text: data = embedding_from_b64(embed_image.text["sd-ti-embedding"]) name = data.get("name", name) else: @@ -2308,14 +2227,11 @@ def load_from_file(self, path, filename): param_dict = data["string_to_param"] if hasattr(param_dict, "_parameters"): param_dict = getattr(param_dict, "_parameters") - assert len( - param_dict) == 1, "embedding file has multiple terms in it" + assert len(param_dict) == 1, "embedding file has multiple terms in it" emb = next(iter(param_dict.items()))[1] # diffuser concepts - elif type(data) == dict and type(next(iter(data.values( - )))) == paddle.Tensor: - assert len(data.keys( - )) == 1, "embedding file has multiple terms in it" + elif type(data) == dict and type(next(iter(data.values()))) == paddle.Tensor: + assert len(data.keys()) == 1, "embedding file has multiple terms in it" emb = next(iter(data.values())) if len(emb.shape) == 1: @@ -2387,7 +2303,8 @@ def load_textual_inversion_embeddings(self, force_reload=False): displayed_embeddings = ( tuple(self.word_embeddings.keys()), - tuple(self.skipped_embeddings.keys()), ) + 
tuple(self.skipped_embeddings.keys()), + ) if self.previously_displayed_embeddings != displayed_embeddings: self.previously_displayed_embeddings = displayed_embeddings print( @@ -2406,7 +2323,7 @@ def find_embedding_at_position(self, tokens, offset): return None, None for ids, embedding in possible_matches: - if tokens[offset:offset + len(ids)] == ids: + if tokens[offset : offset + len(ids)] == ids: return embedding, len(ids) return None, None diff --git a/ppdiffusers/examples/community/wildcard_stable_diffusion.py b/ppdiffusers/examples/community/wildcard_stable_diffusion.py index 80eb36c2a700c..93ad2d40a130a 100644 --- a/ppdiffusers/examples/community/wildcard_stable_diffusion.py +++ b/ppdiffusers/examples/community/wildcard_stable_diffusion.py @@ -21,18 +21,18 @@ from typing import Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( + StableDiffusionPipelineOutput, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -50,8 +50,7 @@ def read_wildcard_values(path: str): return f.read().splitlines() -def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]]={}, - wildcard_files: List[str]=[]): +def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []): for wildcard_file in wildcard_files: filename = get_filename(wildcard_file) read_values = read_wildcard_values(wildcard_file) @@ -62,19 +61,18 @@ def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]]={}, def replace_prompt_with_wildcards( - prompt: str, - wildcard_option_dict: Dict[str, List[str]]={}, - wildcard_files: List[str]=[], ): + prompt: str, + wildcard_option_dict: Dict[str, List[str]] = {}, + wildcard_files: List[str] = [], +): new_prompt = prompt # get wildcard options - wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, - wildcard_files) + wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files) for m in global_re_wildcard.finditer(new_prompt): wildcard_value = m.group() - replace_value = random.choice(wildcard_option_dict[wildcard_value.strip( - "__")]) + replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")]) new_prompt = new_prompt.replace(wildcard_value, replace_value, 1) return new_prompt @@ -125,31 +123,27 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: 
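`find_embedding_at_position` above matches multi-token textual-inversion triggers greedily: `register_embedding` indexes candidates by their first token id and sorts them longest-first, so the longest trigger wins at any position. A compact stand-in with plain lists:

```python
def find_embedding_at_position(ids_lookup, tokens, offset):
    """ids_lookup: {first_token_id: [(ids_tuple, embedding_name), ...]} sorted longest-first."""
    possible_matches = ids_lookup.get(tokens[offset])
    if possible_matches is None:
        return None, None
    for ids, embedding in possible_matches:
        if tuple(tokens[offset:offset + len(ids)]) == tuple(ids):
            return embedding, len(ids)
    return None, None

lookup = {101: [((101, 102, 103), "style-long"), ((101, 102), "style-short")]}
print(find_embedding_at_position(lookup, [7, 101, 102, 103, 9], 1))  # ('style-long', 3)
print(find_embedding_at_position(lookup, [7, 101, 102, 9], 1))       # ('style-short', 2)
```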
StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -171,29 +165,31 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: int=512, - width: int=512, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - seed: Optional[int]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - wildcard_option_dict: Dict[str, List[str]]={}, - wildcard_files: List[str]=[], - num_prompt_samples: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + seed: Optional[int] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + wildcard_option_dict: Dict[str, List[str]] = {}, + wildcard_files: List[str] = [], + num_prompt_samples: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. 
Args: @@ -254,8 +250,7 @@ def __call__( if isinstance(prompt, str): prompt = [ - replace_prompt_with_wildcards(prompt, wildcard_option_dict, - wildcard_files) + replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files) for i in range(num_prompt_samples) ] batch_size = len(prompt) @@ -263,52 +258,46 @@ def __call__( prompt_list = [] for p in prompt: for i in range(num_prompt_samples): - prompt_list.append( - replace_prompt_with_wildcards(p, wildcard_option_dict, - wildcard_files)) + prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files)) prompt = prompt_list batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) # get prompt text embeddings text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -322,14 +311,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -339,23 +330,20 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -375,9 +363,7 @@ def __call__( latents = paddle.randn(latents_shape, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") latents = latents # set timesteps @@ -394,33 +380,26 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -435,12 +414,11 @@ def __call__( image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.astype( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -450,7 +428,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return WildcardStableDiffusionOutput( - images=image, - nsfw_content_detected=has_nsfw_concept, - prompts=prompt) + return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt) diff --git a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py index bd00e8dcc89f6..2088a37dbd9a5 100644 --- a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py @@ -27,133 +27,60 @@ def __init__(self, model_path=None): super().__init__() self.netVggOne = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=3, - out_channels=64, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=64, - out_channels=64, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + paddle.nn.Conv2D(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + ) self.netVggTwo = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=64, - 
out_channels=128, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + paddle.nn.Conv2D(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=128, - out_channels=128, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + ) self.netVggThr = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=128, - out_channels=256, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=256, - out_channels=256, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=256, - out_channels=256, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + ) self.netVggFou = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=256, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + ) self.netVggFiv = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) - - self.netScoreOne = paddle.nn.Conv2D( - in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreTwo = paddle.nn.Conv2D( - in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreThr = paddle.nn.Conv2D( - in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreFou = paddle.nn.Conv2D( - in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreFiv = paddle.nn.Conv2D( - in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) + ) + + self.netScoreOne = paddle.nn.Conv2D(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreTwo = 
paddle.nn.Conv2D(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreThr = paddle.nn.Conv2D(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreFou = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreFiv = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) self.netCombine = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=5, - out_channels=1, - kernel_size=1, - stride=1, - padding=0), - paddle.nn.Sigmoid(), ) + paddle.nn.Conv2D(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0), + paddle.nn.Sigmoid(), + ) if model_path: self.set_state_dict(paddle.load(model_path)) @@ -162,7 +89,8 @@ def forward(self, tenInput): tenInput = tenInput * 255.0 tenInput = tenInput - paddle.to_tensor( [104.00698793, 116.66876762, 122.67891434], - dtype=tenInput.dtype, ).reshape([1, 3, 1, 1]) + dtype=tenInput.dtype, + ).reshape([1, 3, 1, 1]) tenVggOne = self.netVggOne(tenInput) tenVggTwo = self.netVggTwo(tenVggOne) @@ -180,47 +108,48 @@ def forward(self, tenInput): tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreTwo = paddle.nn.functional.interpolate( tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreThr = paddle.nn.functional.interpolate( tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreFou = paddle.nn.functional.interpolate( tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreFiv = paddle.nn.functional.interpolate( tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) - return self.netCombine( - paddle.concat([ - tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv - ], 1)) + return self.netCombine(paddle.concat([tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv], 1)) -remote_model_path = "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams" +remote_model_path = ( + "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams" +) class HEDdetector: def __init__(self, modelpath=None): - modelpath = os.path.join(annotator_ckpts_path, - "network-bsds500.pdparams") + modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pdparams") if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import \ - get_path_from_url_with_filelock + from paddlenlp.utils.downloader import get_path_from_url_with_filelock - get_path_from_url_with_filelock( - remote_model_path, root_dir=annotator_ckpts_path) + get_path_from_url_with_filelock(remote_model_path, root_dir=annotator_ckpts_path) self.model_path = modelpath self.netNetwork = Network(modelpath) self.netNetwork.eval() diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py index ecd0bf926d74d..543d0774c523a 100644 --- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py @@ -44,7 +44,6 @@ def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1): x[depth_pt < bg_th] = 0 y[depth_pt < 
bg_th] = 0 normal = np.stack([x, y, z], axis=2) - normal /= np.sum(normal**2.0, axis=2, keepdims=True)**0.5 - normal_image = (normal * 127.5 + 127.5).clip( - min=0, max=255).astype(np.uint8) + normal /= np.sum(normal**2.0, axis=2, keepdims=True) ** 0.5 + normal_image = (normal * 127.5 + 127.5).clip(min=0, max=255).astype(np.uint8) return depth_image, normal_image diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py index f93fa96d31b20..4726391519074 100644 --- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py +++ b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py @@ -19,25 +19,23 @@ def checkmodel(model_dir, model_name): - if not os.path.exists( - os.path.join(model_dir, model_name, model_name + ".pdmodel")): + if not os.path.exists(os.path.join(model_dir, model_name, model_name + ".pdmodel")): model_url = "https://bj.bcebos.com/v1/paddledet/models/dpt_hybrid.zip" get_path_from_url_with_filelock(model_url, root_dir=model_dir) class MidasInference: def __init__( - self, - model_dir, - model_name="dpt_hybrid", - batchsize=8, - device="GPU", - run_mode="paddle", ): + self, + model_dir, + model_name="dpt_hybrid", + batchsize=8, + device="GPU", + run_mode="paddle", + ): checkmodel(model_dir, model_name) - model_file = os.path.join(model_dir, model_name, - model_name + ".pdmodel") - params_file = os.path.join(model_dir, model_name, - model_name + ".pdiparams") + model_file = os.path.join(model_dir, model_name, model_name + ".pdmodel") + params_file = os.path.join(model_dir, model_name, model_name + ".pdiparams") config = paddle_infer.Config(model_file, params_file) self.batchsize = batchsize if device == "GPU": @@ -69,12 +67,12 @@ def __init__( min_subgraph_size=3, precision_mode=precision_map[run_mode], use_static=False, - use_calib_mode=False, ) + use_calib_mode=False, + ) min_input_shape = {"image": [1, 3, 224, 224]} max_input_shape = {"image": [1, 3, 1280, 1280]} opt_input_shape = {"image": [1, 3, 384, 384]} - config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, - opt_input_shape) + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) # disable print log when predict config.disable_glog_info() diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py index 8e9d9e35206a6..8e453eef33c28 100644 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py @@ -27,13 +27,11 @@ class MLSDdetector: def __init__(self): - model_path = os.path.join(annotator_ckpts_path, - "mlsd_large_512_fp32.pdparams") + model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pdparams") if not os.path.exists(model_path): from basicsr.utils.download_util import load_file_from_url - load_file_from_url( - remote_model_path, model_dir=annotator_ckpts_path) + load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path) self.model = MobileV2_MLSD_Large() self.model.eval() self.model.set_dict(paddle.load(model_path)) @@ -43,10 +41,8 @@ def __call__(self, input_image, thr_v, thr_d): img = input_image img_output = np.zeros_like(img) with paddle.no_grad(): - lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], - thr_v, thr_d) + lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d) for line in lines: x_start, y_start, x_end, y_end 
= [int(val) for val in line] - cv2.line(img_output, (x_start, y_start), (x_end, y_end), - [255, 255, 255], 1) + cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1) return img_output[:, :, (0)] diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py index c1f08257cff39..d9123b0102d3c 100644 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py +++ b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py @@ -20,35 +20,36 @@ class BlockTypeA(paddle.nn.Layer): def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale=True): super(BlockTypeA, self).__init__() self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c2, out_channels=out_c2, kernel_size=1), + paddle.nn.Conv2D(in_channels=in_c2, out_channels=out_c2, kernel_size=1), paddle.nn.BatchNorm2D( num_features=out_c2, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c1, out_channels=out_c1, kernel_size=1), + paddle.nn.Conv2D(in_channels=in_c1, out_channels=out_c1, kernel_size=1), paddle.nn.BatchNorm2D( num_features=out_c1, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.upscale = upscale def forward(self, a, b): b = self.conv1(b) a = self.conv2(a) if self.upscale: - b = paddle.nn.functional.interpolate( - x=b, scale_factor=2.0, mode="bilinear", align_corners=True) + b = paddle.nn.functional.interpolate(x=b, scale_factor=2.0, mode="bilinear", align_corners=True) return paddle.concat(x=(a, b), axis=1) @@ -56,27 +57,29 @@ class BlockTypeB(paddle.nn.Layer): def __init__(self, in_c, out_c): super(BlockTypeB, self).__init__() self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), + paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), paddle.nn.BatchNorm2D( num_features=in_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1), + paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1), paddle.nn.BatchNorm2D( num_features=out_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) def forward(self, x): x = self.conv1(x) + x @@ -93,28 +96,31 @@ def __init__(self, in_c, out_c): out_channels=in_c, kernel_size=3, padding=5, - dilation=5, ), + dilation=5, + ), paddle.nn.BatchNorm2D( num_features=in_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), + paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), paddle.nn.BatchNorm2D( num_features=in_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - 
use_global_stats=True, ), - paddle.nn.ReLU(), ) - self.conv3 = paddle.nn.Conv2D( - in_channels=in_c, out_channels=out_c, kernel_size=1) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) + self.conv3 = paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=1) def forward(self, x): x = self.conv1(x) @@ -143,8 +149,7 @@ def _make_divisible(v, divisor, min_value=None): class ConvBNReLU(paddle.nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, - groups=1): + def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): self.channel_pad = out_planes - in_planes self.stride = stride if stride == 2: @@ -159,23 +164,23 @@ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, stride=stride, padding=padding, groups=groups, - bias_attr=False, ), + bias_attr=False, + ), paddle.nn.BatchNorm2D( num_features=out_planes, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU6(), ) + use_global_stats=True, + ), + paddle.nn.ReLU6(), + ) self.max_pool = paddle.nn.MaxPool2D(kernel_size=stride, stride=stride) def forward(self, x): if self.stride == 2: - x = paddle.nn.functional.pad(x=x, - pad=(0, 1, 0, 1), - mode="constant", - value=0) + x = paddle.nn.functional.pad(x=x, pad=(0, 1, 0, 1), mode="constant", value=0) for module in self: if not isinstance(module, paddle.nn.MaxPool2D): x = module(x) @@ -192,24 +197,27 @@ def __init__(self, inp, oup, stride, expand_ratio): layers = [] if expand_ratio != 1: layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend([ - ConvBNReLU( - hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - paddle.nn.Conv2D( - in_channels=hidden_dim, - out_channels=oup, - kernel_size=1, - stride=1, - padding=0, - bias_attr=False, ), - paddle.nn.BatchNorm2D( - num_features=oup, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, ), - ]) + layers.extend( + [ + ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), + paddle.nn.Conv2D( + in_channels=hidden_dim, + out_channels=oup, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False, + ), + paddle.nn.BatchNorm2D( + num_features=oup, + momentum=1 - 0.1, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + use_global_stats=True, + ), + ] + ) self.conv = paddle.nn.Sequential(*layers) def forward(self, x): @@ -244,23 +252,20 @@ def __init__(self): [6, 64, 4, 2], [6, 96, 3, 1], ] - if (len(inverted_residual_setting) == 0 or - len(inverted_residual_setting[0]) != 4): + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: raise ValueError( - "inverted_residual_setting should be non-empty or a 4-element list, got {}". 
- format(inverted_residual_setting)) - input_channel = _make_divisible(input_channel * width_mult, - round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), - round_nearest) + "inverted_residual_setting should be non-empty or a 4-element list, got {}".format( + inverted_residual_setting + ) + ) + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) features = [ConvBNReLU(4, input_channel, stride=2)] for t, c, n, s in inverted_residual_setting: output_channel = _make_divisible(c * width_mult, round_nearest) for i in range(n): stride = s if i == 0 else 1 - features.append( - block( - input_channel, output_channel, stride, expand_ratio=t)) + features.append(block(input_channel, output_channel, stride, expand_ratio=t)) input_channel = output_channel self.features = paddle.nn.Sequential(*features) self.fpn_selected = [1, 3, 6, 10, 13] @@ -295,8 +300,7 @@ class MobileV2_MLSD_Large(paddle.nn.Layer): def __init__(self): super(MobileV2_MLSD_Large, self).__init__() self.backbone = MobileNetV2() - self.block15 = BlockTypeA( - in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False) + self.block15 = BlockTypeA(in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False) self.block16 = BlockTypeB(128, 64) self.block17 = BlockTypeA(in_c1=32, in_c2=64, out_c1=64, out_c2=64) self.block18 = BlockTypeB(128, 64) diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py index e533433631fb1..1ad8429e69fb9 100644 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py +++ b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py @@ -17,6 +17,7 @@ import cv2 import numpy as np import paddle + """ M-LSD Copyright 2021-present NAVER Corp. 
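The MobileNetV2 hunks above repeatedly route channel counts through `_make_divisible(c * width_mult, round_nearest)`, but only the signature appears in the diff context. As a point of reference, a conventional implementation of that rounding helper — the body below is an assumption matching the widely used MobileNetV2 reference code, not text from this patch — looks like:

```python
def _make_divisible(v, divisor, min_value=None):
    """Round `v` to the nearest multiple of `divisor`, never going below
    `min_value` and never shrinking the result by more than 10%."""
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Guard against rounding down by more than 10% of the original value.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
```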
@@ -48,11 +49,7 @@ def zeros_(tensor): return _no_grad_fill_(tensor, 0) -def kaiming_normal_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_normal_ Args: @@ -100,13 +97,11 @@ def _calculate_gain(nonlinearity, param=None): elif nonlinearity == "leaky_relu": if param is None: negative_slope = 0.01 - elif (not isinstance(param, bool) and isinstance(param, int) or - isinstance(param, float)): + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): # True/False are instances of int, hence check above negative_slope = param else: - raise ValueError("negative_slope {} not a valid number".format( - param)) + raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope**2)) elif nonlinearity == "selu": return 3.0 / 4 @@ -119,8 +114,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False): mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) @@ -137,9 +131,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False): Tuple[fan_in, fan_out] """ if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if reverse: num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] @@ -168,8 +160,8 @@ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): center = tpMap[:, (0), :, :] heat = paddle.nn.functional.sigmoid(x=center).unsqueeze(0) hmax = paddle.nn.functional.max_pool2d( - kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2, - x=heat).squeeze(0) + kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2, x=heat + ).squeeze(0) keep = (hmax == heat).astype(dtype="float32") heat = heat * keep heat = heat.reshape([-1]) @@ -185,21 +177,16 @@ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): return ptss, scores, displacement -def pred_lines(image, - model, - input_shape=[512, 512], - score_thr=0.1, - dist_thr=20.0): +def pred_lines(image, model, input_shape=[512, 512], score_thr=0.1, dist_thr=20.0): h, w, _ = image.shape h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]] resized_image = np.concatenate( [ - cv2.resize( - image, (input_shape[1], input_shape[0]), - interpolation=cv2.INTER_AREA), + cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA), np.ones([input_shape[0], input_shape[1], 1]), ], - axis=-1, ) + axis=-1, + ) resized_image = resized_image.transpose((2, 0, 1)) batch_image = np.expand_dims(resized_image, axis=0).astype("float32") batch_image = batch_image / 127.5 - 1.0 @@ -208,14 +195,13 @@ def pred_lines(image, pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] end = vmap[:, :, 2:] - dist_map = np.sqrt(np.sum((start - end)**2, axis=-1)) + dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) segments_list = [] for center, score in zip(pts, pts_score): y, x = center distance = dist_map[y, x] if score > score_thr and distance > dist_thr: - disp_x_start, disp_y_start, 
disp_x_end, disp_y_end = vmap[(y), ( - x), :] + disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[(y), (x), :] x_start = x + disp_x_start y_start = y + disp_y_start x_end = x + disp_x_end diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py index 7dc16bd999550..e07f249e8c9fe 100644 --- a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py @@ -33,25 +33,19 @@ def __call__(self, oriImg, hand=False): with paddle.no_grad(): canvas = oriImg[:, :, ::-1].copy() canvas.fill(0) - result = self.body_estimation.predict( - oriImg, save_path="saved_images", visualization=False) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], - result["subset"]) + result = self.body_estimation.predict(oriImg, save_path="saved_images", visualization=False) + canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) if hand: - hands_list = util.hand_detect(result["candidate"], - result["subset"], oriImg) + hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg) all_hand_peaks = [] for x, y, w, is_left in hands_list: scale_search = [0.5, 1.0, 1.5, 2.0] peaks = self.hand_estimation.hand_estimation( - oriImg[y:y + w, x:x + w, :], scale_search=scale_search) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], - peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], - peaks[:, 1] + y) + oriImg[y : y + w, x : x + w, :], scale_search=scale_search + ) + peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) + peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) all_hand_peaks.append(peaks) canvas = self.hand_estimation.draw_hand(canvas, all_hand_peaks) - return canvas, dict( - candidate=result["candidate"].tolist(), - subset=result["subset"].tolist()) + return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()) diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/util.py b/ppdiffusers/examples/controlnet/annotator/openpose/util.py index 10028380bbd8a..899e38121eaea 100644 --- a/ppdiffusers/examples/controlnet/annotator/openpose/util.py +++ b/ppdiffusers/examples/controlnet/annotator/openpose/util.py @@ -47,8 +47,7 @@ def pad_right_down_corner(img, stride, padValue): def transfer(model, model_weights): transfered_model_weights = {} for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights[".".join( - weights_name.split(".")[1:])] + transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])] return transfered_model_weights @@ -114,11 +113,9 @@ def draw_bodypose(canvas, candidate, subset): X = candidate[index.astype(int), 1] mX = np.mean(X) mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) return canvas @@ -158,9 +155,9 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): canvas, (x1, y1), (x2, y2), - matplotlib.colors.hsv_to_rgb( - [ie / 
float(len(edges)), 1.0, 1.0]) * 255, - thickness=2, ) + matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, + thickness=2, + ) for i, keyponit in enumerate(peaks): x, y = keyponit @@ -173,7 +170,8 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), - lineType=cv2.LINE_AA, ) + lineType=cv2.LINE_AA, + ) return canvas @@ -194,16 +192,14 @@ def hand_detect(candidate, subset, oriImg): hands = [] # left hand if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[ - [5, 6, 7]] + left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] x1, y1 = candidate[left_shoulder_index][:2] x2, y2 = candidate[left_elbow_index][:2] x3, y3 = candidate[left_wrist_index][:2] hands.append([x1, y1, x2, y2, x3, y3, True]) # right hand if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[ - [2, 3, 4]] + right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] x1, y1 = candidate[right_shoulder_index][:2] x2, y2 = candidate[right_elbow_index][:2] x3, y3 = candidate[right_wrist_index][:2] @@ -218,8 +214,8 @@ def hand_detect(candidate, subset, oriImg): # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); x = x3 + ratioWristElbow * (x3 - x2) y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2) - distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2) + distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) + distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) # x-y refers to the center --> offset to topLeft point # handRectangle.x -= handRectangle.width / 2.f; diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py index 0bb742e72d02a..d2d5ee7249851 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py @@ -39,10 +39,8 @@ def keypoint_to_openpose_kpts(coco_keypoints_list): l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index] r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index] - neck_keypoint_y = int( - (l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) - neck_keypoint_x = int( - (l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) + neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) + neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) neck_keypoint = [ neck_keypoint_x, neck_keypoint_y, @@ -65,33 +63,24 @@ def __call__(self, oriImg, detect_resolution=512, hand=False): img_scalarfactor = detect_resolution / min(oriImg.shape[:2]) result = self.ppdetpose_pred(oriImg) result["candidate"] = result["candidate"] * img_scalarfactor - oriImg = cv2.resize( - oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) + oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) canvas = oriImg.copy() canvas.fill(0) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], - result["subset"]) + canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) if hand: - hands_list = util.hand_detect(result["candidate"], - result["subset"], oriImg) + hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg) all_hand_peaks = [] for x, y, w, is_left in 
hands_list: - scale_search = [ - x * img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0] - ] + scale_search = [x * img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0]] peaks = self.hand_estimation.hand_estimation( - oriImg[y:y + w, x:x + w, ::-1], - scale_search=scale_search) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], - peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], - peaks[:, 1] + y) + oriImg[y : y + w, x : x + w, ::-1], scale_search=scale_search + ) + peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) + peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) all_hand_peaks.append(peaks) canvas = util.draw_handpose(canvas, all_hand_peaks) - return canvas, dict( - candidate=result["candidate"].tolist(), - subset=result["subset"].tolist()) + return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()) def ppdetpose_pred(self, image, kpt_threshold=0.3): poseres = self.ppdetpose.ppdet_hrnet_infer(image) @@ -105,7 +94,12 @@ def ppdetpose_pred(self, image, kpt_threshold=0.3): for idx, item in enumerate(openpose_kpts): if item[2] > kpt_threshold: subset[kptid][idx] = posnum - kpt = np.array(item + [posnum, ]) + kpt = np.array( + item + + [ + posnum, + ] + ) candidate = np.vstack((candidate, kpt)) posnum += 1 return {"candidate": candidate, "subset": subset} diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py index 86f7aca10c143..9236875761299 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py @@ -25,13 +25,14 @@ class PaddleInferBenchmark(object): def __init__( - self, - config, - model_info: dict={}, - data_info: dict={}, - perf_info: dict={}, - resource_info: dict={}, - **kwargs, ): + self, + config, + model_info: dict = {}, + data_info: dict = {}, + perf_info: dict = {}, + resource_info: dict = {}, + **kwargs, + ): """ Construct PaddleInferBenchmark Class to format logs. 
args: @@ -84,8 +85,7 @@ def __init__( self.inference_time_s = round(perf_info["inference_time_s"], 4) except: self.print_help() - raise ValueError( - "Set argument wrong, please check input argument and its type") + raise ValueError("Set argument wrong, please check input argument and its type") self.preprocess_time_s = perf_info.get("preprocess_time_s", 0) self.postprocess_time_s = perf_info.get("postprocess_time_s", 0) @@ -142,13 +142,12 @@ def benchmark_logger(self): level=logging.INFO, format=FORMAT, handlers=[ - logging.FileHandler( - filename=log_output, mode="w"), + logging.FileHandler(filename=log_output, mode="w"), logging.StreamHandler(), - ], ) + ], + ) self.logger = logging.getLogger(__name__) - self.logger.info( - f"Paddle Inference benchmark log will be saved to {log_output}") + self.logger.info(f"Paddle Inference benchmark log will be saved to {log_output}") def parse_config(self, config) -> dict: """ @@ -160,28 +159,22 @@ def parse_config(self, config) -> dict: """ if isinstance(config, paddle_infer.Config): config_status = {} - config_status["runtime_device"] = "gpu" if config.use_gpu( - ) else "cpu" + config_status["runtime_device"] = "gpu" if config.use_gpu() else "cpu" config_status["ir_optim"] = config.ir_optim() config_status["enable_tensorrt"] = config.tensorrt_engine_enabled() config_status["precision"] = self.precision config_status["enable_mkldnn"] = config.mkldnn_enabled() - config_status[ - "cpu_math_library_num_threads"] = config.cpu_math_library_num_threads( - ) + config_status["cpu_math_library_num_threads"] = config.cpu_math_library_num_threads() elif isinstance(config, dict): config_status["runtime_device"] = config.get("runtime_device", "") config_status["ir_optim"] = config.get("ir_optim", "") config_status["enable_tensorrt"] = config.get("enable_tensorrt", "") config_status["precision"] = config.get("precision", "") config_status["enable_mkldnn"] = config.get("enable_mkldnn", "") - config_status["cpu_math_library_num_threads"] = config.get( - "cpu_math_library_num_threads", "") + config_status["cpu_math_library_num_threads"] = config.get("cpu_math_library_num_threads", "") else: self.print_help() - raise ValueError( - "Set argument config wrong, please check input argument and its type" - ) + raise ValueError("Set argument config wrong, please check input argument and its type") return config_status def report(self, identifier=None): @@ -196,54 +189,43 @@ def report(self, identifier=None): identifier = "" self.logger.info("\n") - self.logger.info( - "---------------------- Paddle info ----------------------") + self.logger.info("---------------------- Paddle info ----------------------") self.logger.info(f"{identifier} paddle_version: {self.paddle_version}") self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}") self.logger.info(f"{identifier} paddle_branch: {self.paddle_branch}") self.logger.info(f"{identifier} log_api_version: {self.log_version}") - self.logger.info( - "----------------------- Conf info -----------------------") - self.logger.info( - f"{identifier} runtime_device: {self.config_status['runtime_device']}" - ) - self.logger.info( - f"{identifier} ir_optim: {self.config_status['ir_optim']}") + self.logger.info("----------------------- Conf info -----------------------") + self.logger.info(f"{identifier} runtime_device: {self.config_status['runtime_device']}") + self.logger.info(f"{identifier} ir_optim: {self.config_status['ir_optim']}") self.logger.info(f"{identifier} enable_memory_optim: {True}") - self.logger.info( - 
f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}" - ) - self.logger.info( - f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}") + self.logger.info(f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}") + self.logger.info(f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}") self.logger.info( f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}" ) - self.logger.info( - "----------------------- Model info ----------------------") + self.logger.info("----------------------- Model info ----------------------") self.logger.info(f"{identifier} model_name: {self.model_name}") self.logger.info(f"{identifier} precision: {self.precision}") - self.logger.info( - "----------------------- Data info -----------------------") + self.logger.info("----------------------- Data info -----------------------") self.logger.info(f"{identifier} batch_size: {self.batch_size}") self.logger.info(f"{identifier} input_shape: {self.shape}") self.logger.info(f"{identifier} data_num: {self.data_num}") - self.logger.info( - "----------------------- Perf info -----------------------") + self.logger.info("----------------------- Perf info -----------------------") self.logger.info( f"{identifier} cpu_rss(MB): {self.cpu_rss_mb}, cpu_vms: {self.cpu_vms_mb}, cpu_shared_mb: {self.cpu_shared_mb}, cpu_dirty_mb: {self.cpu_dirty_mb}, cpu_util: {self.cpu_util}%" ) self.logger.info( f"{identifier} gpu_rss(MB): {self.gpu_rss_mb}, gpu_util: {self.gpu_util}%, gpu_mem_util: {self.gpu_mem_util}%" ) - self.logger.info( - f"{identifier} total time spent(s): {self.total_time_s}") + self.logger.info(f"{identifier} total time spent(s): {self.total_time_s}") if self.with_tracker: self.logger.info( f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, " - f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}") + f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}" + ) else: self.logger.info( f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " @@ -261,7 +243,8 @@ def print_help(self): """ print function help """ - print("""Usage: + print( + """Usage: ==== Print inference benchmark logs. 
==== config = paddle.inference.Config() model_info = {'model_name': 'resnet50' @@ -278,7 +261,8 @@ def print_help(self): 'gpu_util': 60} log = PaddleInferBenchmark(config, model_info, data_info, perf_info, resource_info) log('Test') - """) + """ + ) def __call__(self, identifier=None): """ diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py index a89c4c830c5be..3d3a8578fd2bd 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py @@ -24,8 +24,7 @@ from .det_keypoint_unite_utils import argsparser from .infer import PredictConfig # noqa F401 -from .infer import (Detector, DetectorPicoDet, bench_log, get_test_images, - print_arguments) +from .infer import bench_log, get_test_images, print_arguments from .keypoint_infer import KeyPointDetector from .keypoint_postprocess import translate_to_ori_images from .preprocess import decode_image @@ -38,12 +37,10 @@ } -def predict_with_given_det(image, det_res, keypoint_detector, - keypoint_batch_size, run_benchmark): +def predict_with_given_det(image, det_res, keypoint_detector, keypoint_batch_size, run_benchmark): keypoint_res = {} - rec_images, records, det_rects = keypoint_detector.get_person_from_rect( - image, det_res) + rec_images, records, det_rects = keypoint_detector.get_person_from_rect(image, det_res) if len(det_rects) == 0: keypoint_res["keypoint"] = [[], []] @@ -53,23 +50,22 @@ def predict_with_given_det(image, det_res, keypoint_detector, score_vector = [] rect_vector = det_rects - keypoint_results = keypoint_detector.predict_image( - rec_images, run_benchmark, repeats=10, visual=False) - keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, - np.array(records)) + keypoint_results = keypoint_detector.predict_image(rec_images, run_benchmark, repeats=10, visual=False) + keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, np.array(records)) keypoint_res["keypoint"] = ( - [keypoint_vector.tolist(), score_vector.tolist()] - if len(keypoint_vector) > 0 else [[], []]) + [keypoint_vector.tolist(), score_vector.tolist()] if len(keypoint_vector) > 0 else [[], []] + ) keypoint_res["bbox"] = rect_vector return keypoint_res def topdown_unite_predict( - detector, - topdown_keypoint_detector, - image_list, - keypoint_batch_size=1, - save_res=False, ): + detector, + topdown_keypoint_detector, + image_list, + keypoint_batch_size=1, + save_res=False, +): det_timer = detector.get_timer() store_res = [] for i, img_file in enumerate(image_list): @@ -79,8 +75,7 @@ def topdown_unite_predict( det_timer.preprocess_time_s.end() if FLAGS.run_benchmark: - results = detector.predict_image( - [image], run_benchmark=True, repeats=10) + results = detector.predict_image([image], run_benchmark=True, repeats=10) cm, gm, gu = get_current_memory_mb() detector.cpu_mem += cm @@ -95,15 +90,18 @@ def topdown_unite_predict( results, topdown_keypoint_detector, keypoint_batch_size, - FLAGS.run_benchmark, ) + FLAGS.run_benchmark, + ) if save_res: save_name = img_file if isinstance(img_file, str) else i - store_res.append([ - save_name, - keypoint_res["bbox"], - [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], - ]) + store_res.append( + [ + save_name, + keypoint_res["bbox"], + [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], + ] + ) else: results["keypoint"] = [[], []] 
keypoint_res = results @@ -119,7 +117,8 @@ def topdown_unite_predict( img_file, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, - save_dir=FLAGS.output_dir, ) + save_dir=FLAGS.output_dir, + ) if save_res: """ 1) store_res: a list of image_data @@ -133,18 +132,17 @@ def topdown_unite_predict( def topdown_unite_predict_singleimage( - detector, - topdown_keypoint_detector, - image, - keypoint_batch_size=8, - det_threshold=0.25, ): + detector, + topdown_keypoint_detector, + image, + keypoint_batch_size=8, + det_threshold=0.25, +): results = detector.predict_image([image], visual=False) results = detector.filter_box(results, det_threshold) if results["boxes_num"] > 0: - keypoint_res = predict_with_given_det(image, results, - topdown_keypoint_detector, - keypoint_batch_size, False) + keypoint_res = predict_with_given_det(image, results, topdown_keypoint_detector, keypoint_batch_size, False) else: results["keypoint"] = [[], []] @@ -153,11 +151,12 @@ def topdown_unite_predict_singleimage( def topdown_unite_predict_video( - detector, - topdown_keypoint_detector, - camera_id, - keypoint_batch_size=1, - save_res=False, ): + detector, + topdown_keypoint_detector, + camera_id, + keypoint_batch_size=1, + save_res=False, +): video_name = "output.mp4" if camera_id != -1: capture = cv2.VideoCapture(camera_id) @@ -174,12 +173,11 @@ def topdown_unite_predict_video( if not os.path.exists(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) out_path = os.path.join(FLAGS.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 0 store_res = [] - keypoint_smoothing = KeypointSmoothing( - width, height, filter_type=FLAGS.filter_type, beta=0.05) + keypoint_smoothing = KeypointSmoothing(width, height, filter_type=FLAGS.filter_type, beta=0.05) while 1: ret, frame = capture.read() @@ -201,27 +199,25 @@ def topdown_unite_predict_video( results, topdown_keypoint_detector, keypoint_batch_size, - FLAGS.run_benchmark, ) + FLAGS.run_benchmark, + ) if FLAGS.smooth and len(keypoint_res["keypoint"][0]) == 1: current_keypoints = np.array(keypoint_res["keypoint"][0][0]) - smooth_keypoints = keypoint_smoothing.smooth_process( - current_keypoints) + smooth_keypoints = keypoint_smoothing.smooth_process(current_keypoints) keypoint_res["keypoint"][0][0] = smooth_keypoints.tolist() - im = visualize_pose( - frame, - keypoint_res, - visual_thresh=FLAGS.keypoint_threshold, - returnimg=True) + im = visualize_pose(frame, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, returnimg=True) if save_res: - store_res.append([ - index, - keypoint_res["bbox"], - [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], - ]) + store_res.append( + [ + index, + keypoint_res["bbox"], + [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], + ] + ) writer.write(im) if camera_id != -1: @@ -247,37 +243,43 @@ class KeypointSmoothing(object): # https://github.com/jaantollander/OneEuroFilter def __init__( - self, - width, - height, - filter_type, - alpha=0.5, - fc_d=0.1, - fc_min=0.1, - beta=0.1, - thres_mult=0.3, ): + self, + width, + height, + filter_type, + alpha=0.5, + fc_d=0.1, + fc_min=0.1, + beta=0.1, + thres_mult=0.3, + ): super(KeypointSmoothing, self).__init__() self.image_width = width self.image_height = height - self.threshold = (np.array([ - 0.005, - 0.005, - 0.005, - 0.005, - 0.005, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - ]) * thres_mult) + 
self.threshold = ( + np.array( + [ + 0.005, + 0.005, + 0.005, + 0.005, + 0.005, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + ] + ) + * thres_mult + ) self.filter_type = filter_type self.alpha = alpha self.dx_prev_hat = None @@ -302,20 +304,18 @@ def smooth_process(self, current_keypoints): result = current_keypoints num_keypoints = len(current_keypoints) for i in range(num_keypoints): - result[i, :2] = self.smooth(current_keypoints[i, :2], - self.threshold[i], i) + result[i, :2] = self.smooth(current_keypoints[i, :2], self.threshold[i], i) return result def smooth(self, current_keypoint, threshold, index): distance = np.sqrt( - np.square((current_keypoint[0] - self.x_prev_hat[index][0]) / - self.image_width) + np.square((current_keypoint[ - 1] - self.x_prev_hat[index][1]) / self.image_height)) + np.square((current_keypoint[0] - self.x_prev_hat[index][0]) / self.image_width) + + np.square((current_keypoint[1] - self.x_prev_hat[index][1]) / self.image_height) + ) if distance < threshold: result = self.x_prev_hat[index] else: - result = self.smooth_func(current_keypoint, self.x_prev_hat[index], - index) + result = self.smooth_func(current_keypoint, self.x_prev_hat[index], index) return result @@ -360,15 +360,13 @@ def exponential_smoothing(self, x_cur, x_pre, index=0): det_threshold = 0.4 if not os.path.exists(det_model_dir): - detmodel_url = "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip" - get_path_from_url_with_filelock( - detmodel_url, root_dir="annotator/ppdet_hrnet/models/") -if not os.path.exists(keypoint_model_dir): - kptmodel_url = ( - "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip" + detmodel_url = ( + "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip" ) - get_path_from_url_with_filelock( - kptmodel_url, root_dir="annotator/ppdet_hrnet/models/") + get_path_from_url_with_filelock(detmodel_url, root_dir="annotator/ppdet_hrnet/models/") +if not os.path.exists(keypoint_model_dir): + kptmodel_url = "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip" + get_path_from_url_with_filelock(kptmodel_url, root_dir="annotator/ppdet_hrnet/models/") class PPDetPose(object): @@ -391,7 +389,8 @@ def __init__(self) -> None: trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, - threshold=det_threshold, ) + threshold=det_threshold, + ) self.topdown_keypoint_detector = KeyPointDetector( keypoint_model_dir, @@ -404,7 +403,8 @@ def __init__(self) -> None: trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, - use_dark=use_dark, ) + use_dark=use_dark, + ) keypoint_arch = self.topdown_keypoint_detector.pred_config.arch assert ( KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown" @@ -417,7 +417,8 @@ def ppdet_hrnet_infer(self, image): self.topdown_keypoint_detector, image, keypoint_batch_size, - det_threshold, ) + det_threshold, + ) def main(): @@ -439,7 +440,8 @@ def main(): trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, - threshold=FLAGS.det_threshold, ) + threshold=FLAGS.det_threshold, + ) topdown_keypoint_detector = KeyPointDetector( FLAGS.keypoint_model_dir, @@ -452,7 +454,8 @@ def main(): trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, - use_dark=FLAGS.use_dark, ) + use_dark=FLAGS.use_dark, + ) 
keypoint_arch = topdown_keypoint_detector.pred_config.arch assert ( KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown" @@ -465,7 +468,8 @@ def main(): topdown_keypoint_detector, FLAGS.camera_id, FLAGS.keypoint_batch_size, - FLAGS.save_res, ) + FLAGS.save_res, + ) else: # predict from image img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) @@ -474,7 +478,8 @@ def main(): topdown_keypoint_detector, img_list, FLAGS.keypoint_batch_size, - FLAGS.save_res, ) + FLAGS.save_res, + ) if not FLAGS.run_benchmark: detector.det_times.info(average=True) topdown_keypoint_detector.det_times.info(average=True) @@ -496,7 +501,8 @@ def main(): img_list, keypoint_model_info, FLAGS.keypoint_batch_size, - "KeyPoint", ) + "KeyPoint", + ) if __name__ == "__main__": @@ -505,7 +511,6 @@ def main(): FLAGS = parser.parse_args() print_arguments(FLAGS) FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU" - ], "device should be CPU, GPU or XPU" + assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU" main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py index 5290e03d818fa..0d023a6d28d57 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py @@ -22,58 +22,60 @@ def argsparser(): "--det_model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), - required=True, ) + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." + ), + required=True, + ) parser.add_argument( "--keypoint_model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), - required=True, ) - parser.add_argument( - "--image_file", type=str, default=None, help="Path of image file.") + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." + ), + required=True, + ) + parser.add_argument("--image_file", type=str, default=None, help="Path of image file.") parser.add_argument( "--image_dir", type=str, default=None, - help="Dir of image file, `image_file` has a higher priority.", ) + help="Dir of image file, `image_file` has a higher priority.", + ) parser.add_argument( "--keypoint_batch_size", type=int, default=8, - help=("batch_size for keypoint inference. In detection-keypoint unit" - "inference, the batch size in detection is 1. Then collate det " - "result in batch for keypoint inference."), ) + help=( + "batch_size for keypoint inference. In detection-keypoint unit" + "inference, the batch size in detection is 1. Then collate det " + "result in batch for keypoint inference." 
+ ), + ) parser.add_argument( "--video_file", type=str, default=None, help="Path of video file, `video_file` or `camera_id` has a highest priority.", ) - parser.add_argument( - "--camera_id", - type=int, - default=-1, - help="device id of camera to predict.") - parser.add_argument( - "--det_threshold", type=float, default=0.5, help="Threshold of score.") - parser.add_argument( - "--keypoint_threshold", - type=float, - default=0.5, - help="Threshold of score.") + parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.") + parser.add_argument("--det_threshold", type=float, default=0.5, help="Threshold of score.") + parser.add_argument("--keypoint_threshold", type=float, default=0.5, help="Threshold of score.") parser.add_argument( "--output_dir", type=str, default="output", - help="Directory of output visualization files.", ) + help="Directory of output visualization files.", + ) parser.add_argument( "--run_mode", type=str, default="paddle", - help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", ) + help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", + ) parser.add_argument( "--device", type=str, @@ -84,32 +86,24 @@ def argsparser(): "--run_benchmark", type=ast.literal_eval, default=False, - help="Whether to predict a image_file repeatedly for benchmark", ) + help="Whether to predict a image_file repeatedly for benchmark", + ) parser.add_argument( "--enable_mkldnn", type=ast.literal_eval, default=False, - help="Whether use mkldnn with CPU.", ) - parser.add_argument( - "--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument( - "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument( - "--trt_max_shape", - type=int, - default=1280, - help="max_shape for TensorRT.") - parser.add_argument( - "--trt_opt_shape", - type=int, - default=640, - help="opt_shape for TensorRT.") + help="Whether use mkldnn with CPU.", + ) + parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.") + parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.") + parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.") parser.add_argument( "--trt_calib_mode", type=bool, default=False, - help="If the model is produced by TRT offline quantitative " - "calibration, trt_calib_mode need to set True.", ) + help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.", + ) parser.add_argument( "--use_dark", type=ast.literal_eval, @@ -126,7 +120,9 @@ def argsparser(): "2) image_data: [imageid, rects, [keypoints, scores]]" "3) rects: list of rect [xmin, ymin, xmax, ymax]" "4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list" - "5) scores: mean of all joint conf"), ) + "5) scores: mean of all joint conf" + ), + ) parser.add_argument( "--smooth", type=ast.literal_eval, diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py index a2a9769e224e0..6d4135cdfb9a6 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py @@ -25,16 +25,17 @@ from paddle.inference import Config, create_predictor from .benchmark_utils import PaddleInferBenchmark -from .keypoint_preprocess import ( - EvalAffine, 
- TopDownEvalAffine, # noqa F401 - expand_crop) +from .keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop # noqa F401 from .picodet_postprocess import PicoDetPostProcess from .preprocess import Pad # noqa F401 -from .preprocess import (LetterBoxResize, NormalizeImage, PadStride, Permute, - Resize, WarpAffine, decode_image, preprocess) -from .utils import (Timer, argsparser, coco_clsid2catid, get_current_memory_mb, - multiclass_nms) +from .preprocess import preprocess +from .utils import ( + Timer, + argsparser, + coco_clsid2catid, + get_current_memory_mb, + multiclass_nms, +) from .visualize import visualize_box_mask # Global dictionary @@ -81,8 +82,7 @@ def bench_log(detector, img_list, model_info, batch_size=1, name=None): "shape": "dynamic_shape", "data_num": perf_info["img_num"], } - log = PaddleInferBenchmark(detector.config, model_info, data_info, - perf_info, mems) + log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems) log(name) @@ -109,21 +109,22 @@ class Detector(object): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="output", - threshold=0.5, - delete_shuffle_pass=False, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir="output", + threshold=0.5, + delete_shuffle_pass=False, + ): self.pred_config = self.set_config(model_dir) self.predictor, self.config = load_predictor( model_dir, @@ -140,7 +141,8 @@ def __init__( cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, - delete_shuffle_pass=delete_shuffle_pass, ) + delete_shuffle_pass=delete_shuffle_pass, + ) self.det_times = Timer() self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 self.batch_size = batch_size @@ -177,9 +179,7 @@ def preprocess(self, image_list): def postprocess(self, inputs, result): # postprocess output of predictor np_boxes_num = result["boxes_num"] - assert isinstance( - np_boxes_num, - np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`" + assert isinstance(np_boxes_num, np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`" result = {k: v for k, v in result.items() if v is not None} return result @@ -192,7 +192,7 @@ def filter_box(self, result, threshold): filter_num = [] for i in range(len(np_boxes_num)): boxes_num = np_boxes_num[i] - boxes_i = boxes[start_idx:start_idx + boxes_num, :] + boxes_i = boxes[start_idx : start_idx + boxes_num, :] idx = boxes_i[:, 1] > threshold filter_boxes_i = boxes_i[idx, :] filter_boxes.append(filter_boxes_i) @@ -220,8 +220,7 @@ def predict(self, repeats=1, run_benchmark=False): for i in range(repeats): self.predictor.run() paddle.device.cuda.synchronize() - result = dict( - boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) + result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) return result for i in range(repeats): @@ -258,17 +257,18 @@ def get_timer(self): return self.det_times def predict_image_slice( - self, - img_list, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method="nms", - match_threshold=0.6, - match_metric="ios", - run_benchmark=False, - repeats=1, - visual=True, - save_results=False, ): + self, + img_list, + 
slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method="nms", + match_threshold=0.6, + match_metric="ios", + run_benchmark=False, + repeats=1, + visual=True, + save_results=False, + ): # slice infer only support bs=1 results = [] try: @@ -287,14 +287,13 @@ def predict_image_slice( slice_height=slice_size[0], slice_width=slice_size[1], overlap_height_ratio=overlap_ratio[0], - overlap_width_ratio=overlap_ratio[1], ) + overlap_width_ratio=overlap_ratio[1], + ) sub_img_num = len(slice_image_result) merged_bboxs = [] print("slice to {} sub_samples.", sub_img_num) - batch_image_list = [ - slice_image_result.images[_ind] for _ind in range(sub_img_num) - ] + batch_image_list = [slice_image_result.images[_ind] for _ind in range(sub_img_num)] if run_benchmark: # preprocess inputs = self.preprocess(batch_image_list) # warmup @@ -341,10 +340,8 @@ def predict_image_slice( boxes_num = result["boxes_num"][_ind] ed = st + boxes_num shift_amount = slice_image_result.starting_pixels[_ind] - result["boxes"][st:ed][:, 2:4] = ( - result["boxes"][st:ed][:, 2:4] + shift_amount) - result["boxes"][st:ed][:, 4:6] = ( - result["boxes"][st:ed][:, 4:6] + shift_amount) + result["boxes"][st:ed][:, 2:4] = result["boxes"][st:ed][:, 2:4] + shift_amount + result["boxes"][st:ed][:, 4:6] = result["boxes"][st:ed][:, 4:6] + shift_amount merged_bboxs.append(result["boxes"][st:ed]) st = ed @@ -354,16 +351,14 @@ def predict_image_slice( np.concatenate(merged_bboxs), num_classes, match_threshold, - match_metric, ) + match_metric, + ) merged_results["boxes"] = np.concatenate(final_boxes) elif combine_method == "concat": merged_results["boxes"] = np.concatenate(merged_bboxs) else: - raise ValueError( - "Now only support 'nms' or 'concat' to fuse detection results." - ) - merged_results["boxes_num"] = np.array( - [len(merged_results["boxes"])], dtype=np.int32) + raise ValueError("Now only support 'nms' or 'concat' to fuse detection results.") + merged_results["boxes_num"] = np.array([len(merged_results["boxes"])], dtype=np.int32) if visual: visualize( @@ -371,24 +366,25 @@ def predict_image_slice( merged_results, self.pred_config.labels, output_dir=self.output_dir, - threshold=self.threshold, ) + threshold=self.threshold, + ) results.append(merged_results) results = self.merge_batch_result(results) if save_results: Path(self.output_dir).mkdir(exist_ok=True) - self.save_coco_results( - img_list, results, use_coco_category=FLAGS.use_coco_category) + self.save_coco_results(img_list, results, use_coco_category=FLAGS.use_coco_category) return results def predict_image( - self, - image_list, - run_benchmark=False, - repeats=1, - visual=True, - save_results=False, ): + self, + image_list, + run_benchmark=False, + repeats=1, + visual=True, + save_results=False, + ): batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) results = [] for i in range(batch_loop_cnt): @@ -442,13 +438,13 @@ def predict_image( result, self.pred_config.labels, output_dir=self.output_dir, - threshold=self.threshold, ) + threshold=self.threshold, + ) results.append(result) results = self.merge_batch_result(results) if save_results: Path(self.output_dir).mkdir(exist_ok=True) - self.save_coco_results( - image_list, results, use_coco_category=FLAGS.use_coco_category) + self.save_coco_results(image_list, results, use_coco_category=FLAGS.use_coco_category) return results def predict_video(self, video_file, camera_id): @@ -468,7 +464,7 @@ def predict_video(self, video_file, camera_id): if not os.path.exists(self.output_dir): 
os.makedirs(self.output_dir) out_path = os.path.join(self.output_dir, video_out_name) - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 1 while 1: @@ -479,11 +475,7 @@ def predict_video(self, video_file, camera_id): index += 1 results = self.predict_image([frame[:, :, ::-1]], visual=False) - im = visualize_box_mask( - frame, - results, - self.pred_config.labels, - threshold=self.threshold) + im = visualize_box_mask(frame, results, self.pred_config.labels, threshold=self.threshold) im = np.array(im) writer.write(im) if camera_id != -1: @@ -505,43 +497,44 @@ def save_coco_results(self, image_list, results, use_coco_category=False): img_id = i if "boxes" in results: - boxes = results["boxes"][idx:idx + box_num].tolist() - bbox_results.extend([ - { - "image_id": img_id, - "category_id": coco_clsid2catid[int(box[0])] - if use_coco_category else int(box[0]), - "file_name": file_name, - "bbox": [ - box[2], - box[3], - box[4] - box[2], - box[5] - box[3], - ], # xyxy -> xywh - "score": box[1], - } for box in boxes - ]) + boxes = results["boxes"][idx : idx + box_num].tolist() + bbox_results.extend( + [ + { + "image_id": img_id, + "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]), + "file_name": file_name, + "bbox": [ + box[2], + box[3], + box[4] - box[2], + box[5] - box[3], + ], # xyxy -> xywh + "score": box[1], + } + for box in boxes + ] + ) if "masks" in results: import pycocotools.mask as mask_util - boxes = results["boxes"][idx:idx + box_num].tolist() + boxes = results["boxes"][idx : idx + box_num].tolist() masks = results["masks"][i][:box_num].astype(np.uint8) seg_res = [] for box, mask in zip(boxes, masks): - rle = mask_util.encode( - np.array( - mask[:, :, None], dtype=np.uint8, order="F"))[0] + rle = mask_util.encode(np.array(mask[:, :, None], dtype=np.uint8, order="F"))[0] if "counts" in rle: rle["counts"] = rle["counts"].decode("utf8") - seg_res.append({ - "image_id": img_id, - "category_id": coco_clsid2catid[int(box[0])] - if use_coco_category else int(box[0]), - "file_name": file_name, - "segmentation": rle, - "score": box[1], - }) + seg_res.append( + { + "image_id": img_id, + "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]), + "file_name": file_name, + "segmentation": rle, + "score": box[1], + } + ) mask_results.extend(seg_res) idx += box_num @@ -579,20 +572,21 @@ class DetectorSOLOv2(Detector): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="./", - threshold=0.5, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir="./", + threshold=0.5, + ): super(DetectorSOLOv2, self).__init__( model_dir=model_dir, device=device, @@ -606,7 +600,8 @@ def __init__( enable_mkldnn=enable_mkldnn, enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, output_dir=output_dir, - threshold=threshold, ) + threshold=threshold, + ) def predict(self, repeats=1, run_benchmark=False): """ @@ -617,37 +612,24 @@ def predict(self, repeats=1, run_benchmark=False): 'cate_label': label of segm, shape:[N] 'cate_score': confidence score of segm, shape:[N] 
""" - np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array( - [0]) + np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array([0]) if run_benchmark: for i in range(repeats): self.predictor.run() paddle.device.cuda.synchronize() - result = dict( - segm=np_segms, - label=np_label, - score=np_score, - boxes_num=np_boxes_num) + result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) return result for i in range(repeats): self.predictor.run() output_names = self.predictor.get_output_names() - np_boxes_num = self.predictor.get_output_handle(output_names[ - 0]).copy_to_cpu() - np_label = self.predictor.get_output_handle(output_names[ - 1]).copy_to_cpu() - np_score = self.predictor.get_output_handle(output_names[ - 2]).copy_to_cpu() - np_segms = self.predictor.get_output_handle(output_names[ - 3]).copy_to_cpu() - - result = dict( - segm=np_segms, - label=np_label, - score=np_score, - boxes_num=np_boxes_num) + np_boxes_num = self.predictor.get_output_handle(output_names[0]).copy_to_cpu() + np_label = self.predictor.get_output_handle(output_names[1]).copy_to_cpu() + np_score = self.predictor.get_output_handle(output_names[2]).copy_to_cpu() + np_segms = self.predictor.get_output_handle(output_names[3]).copy_to_cpu() + + result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) return result @@ -669,20 +651,21 @@ class DetectorPicoDet(Detector): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="./", - threshold=0.5, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir="./", + threshold=0.5, + ): super(DetectorPicoDet, self).__init__( model_dir=model_dir, device=device, @@ -696,7 +679,8 @@ def __init__( enable_mkldnn=enable_mkldnn, enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, output_dir=output_dir, - threshold=threshold, ) + threshold=threshold, + ) def postprocess(self, inputs, result): # postprocess output of predictor @@ -707,7 +691,8 @@ def postprocess(self, inputs, result): inputs["im_shape"], inputs["scale_factor"], strides=self.pred_config.fpn_stride, - nms_threshold=self.pred_config.nms["nms_threshold"], ) + nms_threshold=self.pred_config.nms["nms_threshold"], + ) np_boxes, np_boxes_num = postprocessor(np_score_list, np_boxes_list) result = dict(boxes=np_boxes, boxes_num=np_boxes_num) return result @@ -736,12 +721,8 @@ def predict(self, repeats=1, run_benchmark=False): output_names = self.predictor.get_output_names() num_outs = int(len(output_names) / 2) for out_idx in range(num_outs): - np_score_list.append( - self.predictor.get_output_handle(output_names[out_idx]) - .copy_to_cpu()) - np_boxes_list.append( - self.predictor.get_output_handle(output_names[ - out_idx + num_outs]).copy_to_cpu()) + np_score_list.append(self.predictor.get_output_handle(output_names[out_idx]).copy_to_cpu()) + np_boxes_list.append(self.predictor.get_output_handle(output_names[out_idx + num_outs]).copy_to_cpu()) result = dict(boxes=np_score_list, boxes_num=np_boxes_list) return result @@ -759,16 +740,14 @@ def create_inputs(imgs, im_info): im_shape = [] scale_factor = [] if len(imgs) == 1: - inputs["image"] = np.array((imgs[0], 
)).astype("float32") - inputs["im_shape"] = np.array( - (im_info[0]["im_shape"], )).astype("float32") - inputs["scale_factor"] = np.array( - (im_info[0]["scale_factor"], )).astype("float32") + inputs["image"] = np.array((imgs[0],)).astype("float32") + inputs["im_shape"] = np.array((im_info[0]["im_shape"],)).astype("float32") + inputs["scale_factor"] = np.array((im_info[0]["scale_factor"],)).astype("float32") return inputs for e in im_info: - im_shape.append(np.array((e["im_shape"], )).astype("float32")) - scale_factor.append(np.array((e["scale_factor"], )).astype("float32")) + im_shape.append(np.array((e["im_shape"],)).astype("float32")) + scale_factor.append(np.array((e["scale_factor"],)).astype("float32")) inputs["im_shape"] = np.concatenate(im_shape, axis=0) inputs["scale_factor"] = np.concatenate(scale_factor, axis=0) @@ -779,8 +758,7 @@ def create_inputs(imgs, im_info): padding_imgs = [] for img in imgs: im_c, im_h, im_w = img.shape[:] - padding_im = np.zeros( - (im_c, max_shape_h, max_shape_w), dtype=np.float32) + padding_im = np.zeros((im_c, max_shape_h, max_shape_w), dtype=np.float32) padding_im[:, :im_h, :im_w] = img padding_imgs.append(padding_im) inputs["image"] = np.stack(padding_imgs, axis=0) @@ -815,9 +793,7 @@ def __init__(self, model_dir): if "fpn_stride" in yml_conf: self.fpn_stride = yml_conf["fpn_stride"] if self.arch == "RCNN" and yml_conf.get("export_onnx", False): - print( - "The RCNN export model is used for ONNX and it only supports batch_size = 1" - ) + print("The RCNN export model is used for ONNX and it only supports batch_size = 1") self.print_config() def check_model(self, yml_conf): @@ -828,8 +804,7 @@ def check_model(self, yml_conf): for support_model in SUPPORT_MODELS: if support_model in yml_conf["arch"]: return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ - "arch"], SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], SUPPORT_MODELS)) def print_config(self): print("----------- Model Configuration -----------") @@ -841,22 +816,23 @@ def print_config(self): def load_predictor( - model_dir, - arch, - run_mode="paddle", - batch_size=1, - device="CPU", - min_subgraph_size=3, - use_dynamic_shape=False, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - delete_shuffle_pass=False, - tuned_trt_shape_file="shape_range_info.pbtxt", ): + model_dir, + arch, + run_mode="paddle", + batch_size=1, + device="CPU", + min_subgraph_size=3, + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + delete_shuffle_pass=False, + tuned_trt_shape_file="shape_range_info.pbtxt", +): """set AnalysisConfig, generate AnalysisPredictor Args: model_dir (str): root path of __model__ and __params__ @@ -877,16 +853,15 @@ def load_predictor( """ if device != "GPU" and run_mode != "paddle": raise ValueError( - "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}". 
- format(run_mode, device)) + "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".format(run_mode, device) + ) infer_model = os.path.join(model_dir, "model.pdmodel") infer_params = os.path.join(model_dir, "model.pdiparams") if not os.path.exists(infer_model): infer_model = os.path.join(model_dir, "inference.pdmodel") infer_params = os.path.join(model_dir, "inference.pdiparams") if not os.path.exists(infer_model): - raise ValueError( - "Cannot find any inference model in dir: {},".format(model_dir)) + raise ValueError("Cannot find any inference model in dir: {},".format(model_dir)) config = Config(infer_model, infer_params) if device == "GPU": # initial GPU memory(M), device ID @@ -912,9 +887,7 @@ def load_predictor( if enable_mkldnn_bfloat16: config.enable_mkldnn_bfloat16() except: - print( - "The current environment does not support `mkldnn`, so disable mkldnn." - ) + print("The current environment does not support `mkldnn`, so disable mkldnn.") pass precision_map = { @@ -931,10 +904,10 @@ def load_predictor( min_subgraph_size=min_subgraph_size, precision_mode=precision_map[run_mode], use_static=False, - use_calib_mode=trt_calib_mode, ) + use_calib_mode=trt_calib_mode, + ) if arch in TUNED_TRT_DYNAMIC_MODELS: - config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file, - True) + config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file, True) if use_dynamic_shape: min_input_shape = { @@ -949,8 +922,7 @@ def load_predictor( "image": [batch_size, 3, trt_opt_shape, trt_opt_shape], "scale_factor": [batch_size, 2], } - config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, - opt_input_shape) + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) print("trt set dynamic shape done!") # disable print log when predict @@ -969,12 +941,9 @@ def get_test_images(infer_dir, infer_img): """ Get image path list in TEST mode """ - assert (infer_img is not None or - infer_dir is not None), "--image_file or --image_dir should be set" - assert infer_img is None or os.path.isfile( - infer_img), "{} is not a file".format(infer_img) - assert infer_dir is None or os.path.isdir( - infer_dir), "{} is not a directory".format(infer_dir) + assert infer_img is not None or infer_dir is not None, "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), "{} is not a directory".format(infer_dir) # infer_img has a higher priority if infer_img and os.path.isfile(infer_img): @@ -982,8 +951,7 @@ def get_test_images(infer_dir, infer_img): images = set() infer_dir = os.path.abspath(infer_dir) - assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format( - infer_dir) + assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(infer_dir) exts = ["jpg", "jpeg", "png", "bmp"] exts += [ext.upper() for ext in exts] for ext in exts: @@ -1003,24 +971,18 @@ def visualize(image_list, result, labels, output_dir="output/", threshold=0.5): im_bboxes_num = result["boxes_num"][idx] im_results = {} if "boxes" in result: - im_results["boxes"] = result["boxes"][start_idx:start_idx + - im_bboxes_num, :] + im_results["boxes"] = result["boxes"][start_idx : start_idx + im_bboxes_num, :] if "masks" in result: - im_results["masks"] = result["masks"][start_idx:start_idx + - im_bboxes_num, :] + im_results["masks"] = result["masks"][start_idx : start_idx + im_bboxes_num, :] if "segm" in result: - im_results["segm"] 
= result["segm"][start_idx:start_idx + - im_bboxes_num, :] + im_results["segm"] = result["segm"][start_idx : start_idx + im_bboxes_num, :] if "label" in result: - im_results["label"] = result["label"][start_idx:start_idx + - im_bboxes_num] + im_results["label"] = result["label"][start_idx : start_idx + im_bboxes_num] if "score" in result: - im_results["score"] = result["score"][start_idx:start_idx + - im_bboxes_num] + im_results["score"] = result["score"][start_idx : start_idx + im_bboxes_num] start_idx += im_bboxes_num - im = visualize_box_mask( - image_file, im_results, labels, threshold=threshold) + im = visualize_box_mask(image_file, im_results, labels, threshold=threshold) img_name = os.path.split(image_file)[-1] if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -1060,7 +1022,8 @@ def main(): enable_mkldnn=FLAGS.enable_mkldnn, enable_mkldnn_bfloat16=FLAGS.enable_mkldnn_bfloat16, threshold=FLAGS.threshold, - output_dir=FLAGS.output_dir, ) + output_dir=FLAGS.output_dir, + ) # predict from video file or camera video stream if FLAGS.video_file is not None or FLAGS.camera_id != -1: @@ -1068,8 +1031,7 @@ def main(): else: # predict from image if FLAGS.image_dir is None and FLAGS.image_file is not None: - assert (FLAGS.batch_size == 1 - ), "batch_size should be 1, when image_file is not None" + assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None" img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) if FLAGS.slice_infer: detector.predict_image_slice( @@ -1080,14 +1042,16 @@ def main(): FLAGS.match_threshold, FLAGS.match_metric, visual=FLAGS.save_images, - save_results=FLAGS.save_results, ) + save_results=FLAGS.save_results, + ) else: detector.predict_image( img_list, FLAGS.run_benchmark, repeats=100, visual=FLAGS.save_images, - save_results=FLAGS.save_results, ) + save_results=FLAGS.save_results, + ) if not FLAGS.run_benchmark: detector.det_times.info(average=True) else: diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py index 8f661fb65fe6b..fa3551f584493 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py @@ -52,20 +52,21 @@ class KeyPointDetector(Detector): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - output_dir="output", - threshold=0.5, - use_dark=True, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + output_dir="output", + threshold=0.5, + use_dark=True, + ): super(KeyPointDetector, self).__init__( model_dir=model_dir, device=device, @@ -78,7 +79,8 @@ def __init__( cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, output_dir=output_dir, - threshold=threshold, ) + threshold=threshold, + ) self.use_dark = use_dark def set_config(self, model_dir): @@ -105,8 +107,7 @@ def postprocess(self, inputs, result): np_heatmap = result["heatmap"] np_masks = result["masks"] # postprocess output of predictor - if KEYPOINT_SUPPORT_MODELS[ - self.pred_config.arch] == "keypoint_bottomup": + if KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_bottomup": results = {} h, w = inputs["im_shape"][0] preds = 
[np_heatmap] @@ -118,8 +119,7 @@ def postprocess(self, inputs, result): results["keypoint"] = kpts results["score"] = scores return results - elif KEYPOINT_SUPPORT_MODELS[ - self.pred_config.arch] == "keypoint_topdown": + elif KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_topdown": results = {} imshape = inputs["im_shape"][:, ::-1] center = np.round(imshape / 2.0) @@ -130,8 +130,7 @@ def postprocess(self, inputs, result): results["score"] = scores return results else: - raise ValueError("Unsupported arch: {}, expect {}".format( - self.pred_config.arch, KEYPOINT_SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(self.pred_config.arch, KEYPOINT_SUPPORT_MODELS)) def predict(self, repeats=1): """ @@ -162,11 +161,7 @@ def predict(self, repeats=1): result = dict(heatmap=np_heatmap, masks=np_masks) return result - def predict_image(self, - image_list, - run_benchmark=False, - repeats=1, - visual=True): + def predict_image(self, image_list, run_benchmark=False, repeats=1, visual=True): results = [] batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) for i in range(batch_loop_cnt): @@ -222,7 +217,8 @@ def predict_image(self, batch_image_list, result, visual_thresh=self.threshold, - save_dir=self.output_dir, ) + save_dir=self.output_dir, + ) results.append(result) results = self.merge_batch_result(results) @@ -245,7 +241,7 @@ def predict_video(self, video_file, camera_id): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) out_path = os.path.join(self.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 1 while 1: @@ -257,8 +253,7 @@ def predict_video(self, video_file, camera_id): results = self.predict_image([frame[:, :, ::-1]], visual=False) im_results = {} im_results["keypoint"] = [results["keypoint"], results["score"]] - im = visualize_pose( - frame, im_results, visual_thresh=self.threshold, returnimg=True) + im = visualize_pose(frame, im_results, visual_thresh=self.threshold, returnimg=True) writer.write(im) if camera_id != -1: cv2.imshow("Mask Detection", im) @@ -315,8 +310,7 @@ def check_model(self, yml_conf): for support_model in KEYPOINT_SUPPORT_MODELS: if support_model in yml_conf["arch"]: return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ - "arch"], KEYPOINT_SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], KEYPOINT_SUPPORT_MODELS)) def print_config(self): print("----------- Model Configuration -----------") @@ -332,14 +326,10 @@ def visualize(image_list, results, visual_thresh=0.6, save_dir="output"): for i, image_file in enumerate(image_list): skeletons = results["keypoint"] scores = results["score"] - skeleton = skeletons[i:i + 1] - score = scores[i:i + 1] + skeleton = skeletons[i : i + 1] + score = scores[i : i + 1] im_results["keypoint"] = [skeleton, score] - visualize_pose( - image_file, - im_results, - visual_thresh=visual_thresh, - save_dir=save_dir) + visualize_pose(image_file, im_results, visual_thresh=visual_thresh, save_dir=save_dir) def main(): @@ -356,7 +346,8 @@ def main(): enable_mkldnn=FLAGS.enable_mkldnn, threshold=FLAGS.threshold, output_dir=FLAGS.output_dir, - use_dark=FLAGS.use_dark, ) + use_dark=FLAGS.use_dark, + ) # predict from video file or camera video stream if FLAGS.video_file is not None or FLAGS.camera_id != -1: @@ -385,8 +376,7 @@ def main(): "shape": "dynamic_shape", "data_num": 
perf_info["img_num"], } - det_log = PaddleInferBenchmark(detector.config, model_info, - data_info, perf_info, mems) + det_log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems) det_log("KeyPoint") @@ -396,8 +386,7 @@ def main(): FLAGS = parser.parse_args() print_arguments(FLAGS) FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU" - ], "device should be CPU, GPU or XPU" + assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU" assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device" main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py index 01aa825cb00ee..8ba1f6a47b0cd 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py @@ -50,14 +50,11 @@ def lerp(self, j, y, x, heatmap): right = np.clip(x + 1, 0, W - 1) up = np.clip(y - 1, 0, H - 1) down = np.clip(y + 1, 0, H - 1) - offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, - -0.25) - offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, - -0.25) + offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, -0.25) + offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, -0.25) return offset_y + 0.5, offset_x + 0.5 - def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, - original_width): + def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, original_width): N, J, H, W = heatmap.shape assert N == 1, "only support batch size 1" @@ -67,8 +64,9 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, inds_np = inds_k[0] y = inds_np // W x = inds_np % W - tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), - y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1]) + tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), y.flatten(), x.flatten()].reshape( + J, -1, tagmap.shape[-1] + ) coords = np.stack((y, x), axis=2) # threshold mask = heats > self.heat_thresh @@ -94,11 +92,8 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, cluster[key]["scores"][jid] = heats[jid, i] cluster[key]["coords"][jid] = coords[jid, i] continue - candidates = list(cluster.keys())[:self.max_num_people] - centroids = [ - np.mean( - cluster[k]["tags"], axis=0) for k in candidates - ] + candidates = list(cluster.keys())[: self.max_num_people] + centroids = [np.mean(cluster[k]["tags"], axis=0) for k in candidates] num_clusters = len(centroids) # shape is (num_valid, num_clusters, tag_dim) dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] 
@@ -111,12 +106,12 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, cost, ((0, 0), (0, num_valid - num_clusters)), "constant", - constant_values=((0, 0), (0, 1e-10)), ) + constant_values=((0, 0), (0, 1e-10)), + ) rows, cols = linear_sum_assignment(cost) for y, x in zip(rows, cols): tag = tags[jid, y] - if (y < num_valid and x < num_clusters and - l2_dist[y, x] < self.tag_thresh): + if y < num_valid and x < num_clusters and l2_dist[y, x] < self.tag_thresh: key = candidates[x] # merge to cluster else: key = tag[0] # initialize new cluster @@ -151,7 +146,7 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, if True: for pid, coords in enumerate(pose_coords): tag_mean = np.array(pose_tags[pid]).mean(axis=0) - norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5 + norm = np.sum((tagmap - tag_mean) ** 2, axis=3) ** 0.5 score = heatmap - np.round(norm) # (J, H, W) flat_score = score.reshape(J, -1) max_inds = np.argmax(flat_score, axis=1) @@ -167,9 +162,7 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, pose_coords[pid][salvage_joints, 0] = y pose_coords[pid][salvage_joints, 1] = x pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] - pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], - original_height, original_width, - min(H, W)) + pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], original_height, original_width, min(H, W)) return pose_kpts, mean_score @@ -193,9 +186,7 @@ def warp_affine_joints(joints, mat): joints = np.array(joints) shape = joints.shape joints = joints.reshape(-1, 2) - return np.dot(np.concatenate( - (joints, joints[:, 0:1] * 0 + 1), axis=1), - mat.T).reshape(shape) + return np.dot(np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), mat.T).reshape(shape) class HRNetPostProcess(object): @@ -203,9 +194,7 @@ def __init__(self, use_dark=True): self.use_dark = use_dark def flip_back(self, output_flipped, matched_parts): - assert ( - output_flipped.ndim == 4 - ), "output_flipped should be [batch_size, num_joints, height, width]" + assert output_flipped.ndim == 4, "output_flipped should be [batch_size, num_joints, height, width]" output_flipped = output_flipped[:, :, :, ::-1] @@ -226,8 +215,7 @@ def get_max_preds(self, heatmaps): preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints """ - assert isinstance(heatmaps, - np.ndarray), "heatmaps should be numpy.ndarray" + assert isinstance(heatmaps, np.ndarray), "heatmaps should be numpy.ndarray" assert heatmaps.ndim == 4, "batch_images should be 4-ndim" batch_size = heatmaps.shape[0] @@ -277,10 +265,8 @@ def dark_parse(self, hm, coord): dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) - dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] - - hm[py + 1][px - 1] + hm[py - 1][px - 1]) - dyy = 0.25 * ( - hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) + dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] - hm[py + 1][px - 1] + hm[py - 1][px - 1]) + dyy = 0.25 * (hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) derivative = np.matrix([[dx], [dy]]) hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) if dxx * dyy - dxy**2 != 0: @@ -331,25 +317,24 @@ def get_final_preds(self, heatmaps, center, scale, kernelsize=3): px = int(math.floor(coords[n][p][0] + 0.5)) py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < 
heatmap_width - 1 and 1 < py < heatmap_height - 1: - diff = np.array([ - hm[py][px + 1] - hm[py][px - 1], - hm[py + 1][px] - hm[py - 1][px], - ]) + diff = np.array( + [ + hm[py][px + 1] - hm[py][px - 1], + hm[py + 1][px] - hm[py - 1][px], + ] + ) coords[n][p] += np.sign(diff) * 0.25 preds = coords.copy() # Transform back for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], - [heatmap_width, heatmap_height]) + preds[i] = transform_preds(coords[i], center[i], scale[i], [heatmap_width, heatmap_height]) return preds, maxvals def __call__(self, output, center, scale): preds, maxvals = self.get_final_preds(output, center, scale) - return np.concatenate( - (preds, maxvals), axis=-1), np.mean( - maxvals, axis=1) + return np.concatenate((preds, maxvals), axis=-1), np.mean(maxvals, axis=1) def transform_preds(coords, center, scale, output_size): diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py index 86bf7e57c6605..68173f62bd043 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py @@ -48,18 +48,12 @@ def get_affine_mat_kernel(h, w, s, inv=False): center = np.array([np.round(w / 2.0), np.round(h / 2.0)]) size_resized = (w_, h_) - trans = get_affine_transform( - center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) + trans = get_affine_transform(center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) return trans, size_resized -def get_affine_transform(center, - input_size, - rot, - output_size, - shift=(0.0, 0.0), - inv=False): +def get_affine_transform(center, input_size, rot, output_size, shift=(0.0, 0.0), inv=False): """Get the affine transform matrix, given the center/scale/rot/output_size. 
Args: @@ -134,13 +128,13 @@ def get_warp_matrix(theta, size_input, size_dst, size_target): matrix[0, 0] = np.cos(theta) * scale_x matrix[0, 1] = -np.sin(theta) * scale_x matrix[0, 2] = scale_x * ( - -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * - np.sin(theta) + 0.5 * size_target[0]) + -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * np.sin(theta) + 0.5 * size_target[0] + ) matrix[1, 0] = np.sin(theta) * scale_y matrix[1, 1] = np.cos(theta) * scale_y matrix[1, 2] = scale_y * ( - -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * - np.cos(theta) + 0.5 * size_target[1]) + -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * np.cos(theta) + 0.5 * size_target[1] + ) return matrix @@ -212,19 +206,22 @@ def __call__(self, image, im_info): rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], - scale, ) + scale, + ) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR, ) + flags=cv2.INTER_LINEAR, + ) else: trans = get_affine_transform(center, scale, rot, self.trainsize) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR, ) + flags=cv2.INTER_LINEAR, + ) return image, im_info diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py index e858fa5051eaf..aa9b060ce7059 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py @@ -41,8 +41,8 @@ def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): rest_boxes = boxes[indexes, :] iou = iou_of( rest_boxes, - np.expand_dims( - current_box, axis=0), ) + np.expand_dims(current_box, axis=0), + ) indexes = indexes[iou <= iou_threshold] return box_scores[picked, :] @@ -88,15 +88,16 @@ class PicoDetPostProcess(object): """ def __init__( - self, - input_shape, - ori_shape, - scale_factor, - strides=[8, 16, 32, 64], - score_threshold=0.4, - nms_threshold=0.5, - nms_top_k=1000, - keep_top_k=100, ): + self, + input_shape, + ori_shape, + scale_factor, + strides=[8, 16, 32, 64], + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=1000, + keep_top_k=100, + ): self.ori_shape = ori_shape self.input_shape = input_shape self.scale_factor = scale_factor @@ -113,15 +114,13 @@ def warp_boxes(self, boxes, ori_shape): if n: # warp points xy = np.ones((n * 4, 3)) - xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( - n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 # xy = xy @ M.T # transform xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale # create new boxes x = xy[:, [0, 2, 4, 6]] y = xy[:, [1, 3, 5, 7]] - xy = (np.concatenate( - (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T) + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T # clip boxes xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) @@ -138,8 +137,7 @@ def __call__(self, scores, raw_boxes): # generate centers decode_boxes = [] select_scores = [] - for stride, box_distribute, score in zip(self.strides, raw_boxes, - scores): + for stride, box_distribute, score in zip(self.strides, raw_boxes, scores): box_distribute = box_distribute[batch_id] score = score[batch_id] # centers @@ -162,7 +160,7 @@ def __call__(self, scores, raw_boxes): 
# top K candidate topk_idx = np.argsort(score.max(axis=1))[::-1] - topk_idx = topk_idx[:self.nms_top_k] + topk_idx = topk_idx[: self.nms_top_k] center = center[topk_idx] score = score[topk_idx] box_distance = box_distance[topk_idx] @@ -185,12 +183,12 @@ def __call__(self, scores, raw_boxes): if probs.shape[0] == 0: continue subset_boxes = bboxes[mask, :] - box_probs = np.concatenate( - [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1) box_probs = hard_nms( box_probs, iou_threshold=self.nms_threshold, - top_k=self.keep_top_k, ) + top_k=self.keep_top_k, + ) picked_box_probs.append(box_probs) picked_labels.extend([class_index] * box_probs.shape[0]) @@ -202,24 +200,25 @@ def __call__(self, scores, raw_boxes): picked_box_probs = np.concatenate(picked_box_probs) # resize output boxes - picked_box_probs[:, :4] = self.warp_boxes( - picked_box_probs[:, :4], self.ori_shape[batch_id]) - im_scale = np.concatenate([ - self.scale_factor[batch_id][::-1], - self.scale_factor[batch_id][::-1], - ]) + picked_box_probs[:, :4] = self.warp_boxes(picked_box_probs[:, :4], self.ori_shape[batch_id]) + im_scale = np.concatenate( + [ + self.scale_factor[batch_id][::-1], + self.scale_factor[batch_id][::-1], + ] + ) picked_box_probs[:, :4] /= im_scale # clas score box out_boxes_list.append( np.concatenate( [ - np.expand_dims( - np.array(picked_labels), axis=-1), - np.expand_dims( - picked_box_probs[:, 4], axis=-1), + np.expand_dims(np.array(picked_labels), axis=-1), + np.expand_dims(picked_box_probs[:, 4], axis=-1), picked_box_probs[:, :4], ], - axis=1, )) + axis=1, + ) + ) out_boxes_num.append(len(picked_labels)) out_boxes_list = np.concatenate(out_boxes_list, axis=0) diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py index 1066879f2e9ad..e57404bfe6c10 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py @@ -64,16 +64,9 @@ def __call__(self, im, im_info): im_info (dict): info of processed image """ im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize( - im, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) + im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) im_info["im_shape"] = np.array(im.shape[:2]).astype("float32") - im_info["scale_factor"] = np.array( - [im_scale_y, im_scale_x]).astype("float32") + im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32") return im, im_info def generate_scale(self, img): @@ -140,16 +133,9 @@ def __call__(self, im, im_info): assert len(self.target_size) == 2 assert self.target_size[0] > 0 and self.target_size[1] > 0 im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize( - im, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) + im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) im_info["im_shape"] = np.array(im.shape[:2]).astype("float32") - im_info["scale_factor"] = np.array( - [im_scale_y, im_scale_x]).astype("float32") + im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32") return im, im_info def generate_scale(self, im): @@ -189,12 +175,13 @@ class ShortSizeScale(object): """ def __init__( - self, - short_size, - fixed_ratio=True, - keep_ratio=None, - do_round=False, - backend="pillow", ): + self, + 
short_size, + fixed_ratio=True, + keep_ratio=None, + do_round=False, + backend="pillow", + ): self.short_size = short_size assert (fixed_ratio and not keep_ratio) or ( not fixed_ratio @@ -236,10 +223,8 @@ def __call__(self, img): oh = self.short_size else: scale_factor = self.short_size / w - oh = (int(h * float(scale_factor) + 0.5) - if self.do_round else int(h * self.short_size / w)) - ow = (int(w * float(scale_factor) + 0.5) - if self.do_round else int(w * self.short_size / h)) + oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w) + ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h) else: oh = self.short_size if self.fixed_ratio: @@ -248,10 +233,8 @@ def __call__(self, img): ow = self.short_size else: scale_factor = self.short_size / h - oh = (int(h * float(scale_factor) + 0.5) - if self.do_round else int(h * self.short_size / w)) - ow = (int(w * float(scale_factor) + 0.5) - if self.do_round else int(w * self.short_size / h)) + oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w) + ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h) if type(img) == np.ndarray: img = Image.fromarray(img, mode="RGB") @@ -259,12 +242,9 @@ def __call__(self, img): if self.backend == "pillow": result_img = img.resize((ow, oh), Image.BILINEAR) elif self.backend == "cv2" and (self.keep_ratio is not None): - result_img = cv2.resize( - img, (ow, oh), interpolation=cv2.INTER_LINEAR) + result_img = cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR) else: - result_img = Image.fromarray( - cv2.resize( - np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR)) + result_img = Image.fromarray(cv2.resize(np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR)) return result_img @@ -313,7 +293,9 @@ class Permute(object): channel_first (bool): whether convert HWC to CHW """ - def __init__(self, ): + def __init__( + self, + ): super(Permute, self).__init__() def __call__(self, im, im_info): @@ -379,17 +361,15 @@ def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): ratio = min(ratio_h, ratio_w) new_shape = ( round(shape[1] * ratio), - round(shape[0] * ratio), ) # [width, height] + round(shape[0] * ratio), + ) # [width, height] padw = (width - new_shape[0]) / 2 padh = (height - new_shape[1]) / 2 top, bottom = round(padh - 0.1), round(padh + 0.1) left, right = round(padw - 0.1), round(padw + 0.1) - img = cv2.resize( - img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border - img = cv2.copyMakeBorder( - img, top, bottom, left, right, cv2.BORDER_CONSTANT, - value=color) # padded rectangular + img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular return img, ratio, padw, padh def __call__(self, im, im_info): @@ -445,14 +425,15 @@ class WarpAffine(object): """Warp affine the image""" def __init__( - self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - scale=0.4, - shift=0.1, - down_ratio=4, ): + self, + keep_res=False, + pad=31, + input_h=512, + input_w=512, + scale=0.4, + shift=0.1, + down_ratio=4, + ): self.keep_res = keep_res self.pad = pad self.input_h = input_h @@ -489,32 +470,32 @@ def __call__(self, im, im_info): trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) img = cv2.resize(img, (w, h)) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), 
flags=cv2.INTER_LINEAR) + inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) if not self.keep_res: out_h = input_h // self.down_ratio out_w = input_w // self.down_ratio trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) - im_info.update({ - "center": c, - "scale": s, - "out_height": out_h, - "out_width": out_w, - "inp_height": input_h, - "inp_width": input_w, - "trans_input": trans_input, - "trans_output": trans_output, - }) + im_info.update( + { + "center": c, + "scale": s, + "out_height": out_h, + "out_width": out_w, + "inp_height": input_h, + "inp_width": input_w, + "trans_input": trans_input, + "trans_output": trans_output, + } + ) return inp, im_info def preprocess(im, preprocess_ops): # process image by preprocess_ops im_info = { - "scale_factor": np.array( - [1.0, 1.0], dtype=np.float32), + "scale_factor": np.array([1.0, 1.0], dtype=np.float32), "im_shape": None, } im, im_info = decode_image(im, im_info) diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py index 179b3b366e15a..1d38777a4526c 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py @@ -46,8 +46,7 @@ def pad_right_down_corner(img, stride, padValue): def transfer(model, model_weights): transfered_model_weights = {} for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights[".".join( - weights_name.split(".")[1:])] + transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])] return transfered_model_weights @@ -113,11 +112,9 @@ def draw_bodypose(canvas, candidate, subset): X = candidate[index.astype(int), 1] mX = np.mean(X) mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) return canvas @@ -156,9 +153,9 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): canvas, (x1, y1), (x2, y2), - matplotlib.colors.hsv_to_rgb( - [ie / float(len(edges)), 1.0, 1.0]) * 255, - thickness=2, ) + matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, + thickness=2, + ) for i, keyponit in enumerate(peaks): x, y = keyponit @@ -171,7 +168,8 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), - lineType=cv2.LINE_AA, ) + lineType=cv2.LINE_AA, + ) return canvas @@ -192,16 +190,14 @@ def hand_detect(candidate, subset, oriImg): hands = [] # left hand if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[ - [5, 6, 7]] + left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] x1, y1 = candidate[left_shoulder_index][:2] x2, y2 = candidate[left_elbow_index][:2] x3, y3 = candidate[left_wrist_index][:2] hands.append([x1, y1, x2, y2, x3, y3, True]) # right hand if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[ - [2, 3, 4]] + right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] x1, y1 = candidate[right_shoulder_index][:2] x2, y2 = 
candidate[right_elbow_index][:2] x3, y3 = candidate[right_wrist_index][:2] @@ -216,8 +212,8 @@ def hand_detect(candidate, subset, oriImg): # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); x = x3 + ratioWristElbow * (x3 - x2) y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2) - distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2) + distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) + distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) width = 1.0 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) # x-y refers to the center --> offset to topLeft point # handRectangle.x -= handRectangle.width / 2.f; diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py index eba62c30d1e34..eb3856ca3a117 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py @@ -26,41 +26,40 @@ def argsparser(): "--model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), - required=True, ) - parser.add_argument( - "--image_file", type=str, default=None, help="Path of image file.") + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." + ), + required=True, + ) + parser.add_argument("--image_file", type=str, default=None, help="Path of image file.") parser.add_argument( "--image_dir", type=str, default=None, - help="Dir of image file, `image_file` has a higher priority.", ) - parser.add_argument( - "--batch_size", type=int, default=1, help="batch_size for inference.") + help="Dir of image file, `image_file` has a higher priority.", + ) + parser.add_argument("--batch_size", type=int, default=1, help="batch_size for inference.") parser.add_argument( "--video_file", type=str, default=None, help="Path of video file, `video_file` or `camera_id` has a highest priority.", ) - parser.add_argument( - "--camera_id", - type=int, - default=-1, - help="device id of camera to predict.") - parser.add_argument( - "--threshold", type=float, default=0.5, help="Threshold of score.") + parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.") + parser.add_argument("--threshold", type=float, default=0.5, help="Threshold of score.") parser.add_argument( "--output_dir", type=str, default="output", - help="Directory of output visualization files.", ) + help="Directory of output visualization files.", + ) parser.add_argument( "--run_mode", type=str, default="paddle", - help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", ) + help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", + ) parser.add_argument( "--device", type=str, @@ -71,74 +70,70 @@ def argsparser(): "--use_gpu", type=ast.literal_eval, default=False, - help="Deprecated, please use `--device`.", ) + help="Deprecated, please use `--device`.", + ) parser.add_argument( "--run_benchmark", type=ast.literal_eval, default=False, - help="Whether to predict a image_file repeatedly for benchmark", ) + help="Whether to predict a image_file repeatedly for benchmark", + ) parser.add_argument( "--enable_mkldnn", type=ast.literal_eval, default=False, - help="Whether use mkldnn with CPU.", ) + help="Whether use mkldnn with CPU.", + ) parser.add_argument( "--enable_mkldnn_bfloat16", 
type=ast.literal_eval, default=False, - help="Whether use mkldnn bfloat16 inference with CPU.", ) - parser.add_argument( - "--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument( - "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument( - "--trt_max_shape", - type=int, - default=1280, - help="max_shape for TensorRT.") - parser.add_argument( - "--trt_opt_shape", - type=int, - default=640, - help="opt_shape for TensorRT.") + help="Whether use mkldnn bfloat16 inference with CPU.", + ) + parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.") + parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.") + parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.") parser.add_argument( "--trt_calib_mode", type=bool, default=False, - help="If the model is produced by TRT offline quantitative " - "calibration, trt_calib_mode need to set True.", ) + help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.", + ) parser.add_argument( "--save_images", type=ast.literal_eval, default=True, - help="Save visualization image results.", ) - parser.add_argument( - "--save_mot_txts", - action="store_true", - help="Save tracking results (txt).") + help="Save visualization image results.", + ) + parser.add_argument("--save_mot_txts", action="store_true", help="Save tracking results (txt).") parser.add_argument( "--save_mot_txt_per_img", action="store_true", - help="Save tracking results (txt) for each image.", ) + help="Save tracking results (txt) for each image.", + ) parser.add_argument( "--scaled", type=bool, default=False, - help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " - "True in general detector.", ) - parser.add_argument( - "--tracker_config", type=str, default=None, help=("tracker donfig")) + help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.", + ) + parser.add_argument("--tracker_config", type=str, default=None, help=("tracker donfig")) parser.add_argument( "--reid_model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), ) + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." 
+ ), + ) parser.add_argument( "--reid_batch_size", type=int, default=50, - help="max batch_size for reid model inference.", ) + help="max batch_size for reid model inference.", + ) parser.add_argument( "--use_dark", type=ast.literal_eval, @@ -149,27 +144,32 @@ def argsparser(): "--action_file", type=str, default=None, - help="Path of input file for action recognition.", ) + help="Path of input file for action recognition.", + ) parser.add_argument( "--window_size", type=int, default=50, - help="Temporal size of skeleton feature for action recognition.", ) + help="Temporal size of skeleton feature for action recognition.", + ) parser.add_argument( "--random_pad", type=ast.literal_eval, default=False, - help="Whether do random padding for action recognition.", ) + help="Whether do random padding for action recognition.", + ) parser.add_argument( "--save_results", action="store_true", default=False, - help="Whether save detection result to file using coco format", ) + help="Whether save detection result to file using coco format", + ) parser.add_argument( "--use_coco_category", action="store_true", default=False, - help="Whether to use the coco format dictionary `clsid2catid`", ) + help="Whether to use the coco format dictionary `clsid2catid`", + ) parser.add_argument( "--slice_infer", action="store_true", @@ -180,13 +180,15 @@ def argsparser(): nargs="+", type=int, default=[640, 640], - help="Height of the sliced image.", ) + help="Height of the sliced image.", + ) parser.add_argument( "--overlap_ratio", nargs="+", type=float, default=[0.25, 0.25], - help="Overlap height ratio of the sliced image.", ) + help="Overlap height ratio of the sliced image.", + ) parser.add_argument( "--combine_method", type=str, @@ -197,12 +199,14 @@ def argsparser(): "--match_threshold", type=float, default=0.6, - help="Combine method matching threshold.", ) + help="Combine method matching threshold.", + ) parser.add_argument( "--match_metric", type=str, default="ios", - help="Combine method matching metric, choose in ['iou', 'ios'].", ) + help="Combine method matching metric, choose in ['iou', 'ios'].", + ) return parser @@ -254,38 +258,34 @@ def info(self, average=False): total_time = total_time + track_time total_time = round(total_time, 4) print("------------------ Inference Time Info ----------------------") - print("total_time(ms): {}, img_num: {}".format(total_time * 1000, - self.img_num)) - preprocess_time = (round(pre_time / max(1, self.img_num), 4) - if average else pre_time) - postprocess_time = (round(post_time / max(1, self.img_num), 4) - if average else post_time) - inference_time = (round(infer_time / max(1, self.img_num), 4) - if average else infer_time) - tracking_time = (round(track_time / max(1, self.img_num), 4) - if average else track_time) + print("total_time(ms): {}, img_num: {}".format(total_time * 1000, self.img_num)) + preprocess_time = round(pre_time / max(1, self.img_num), 4) if average else pre_time + postprocess_time = round(post_time / max(1, self.img_num), 4) if average else post_time + inference_time = round(infer_time / max(1, self.img_num), 4) if average else infer_time + tracking_time = round(track_time / max(1, self.img_num), 4) if average else track_time average_latency = total_time / max(1, self.img_num) qps = 0 if total_time > 0: qps = 1 / average_latency - print("average latency time(ms): {:.2f}, QPS: {:2f}".format( - average_latency * 1000, qps)) + print("average latency time(ms): {:.2f}, QPS: {:2f}".format(average_latency * 1000, qps)) if self.with_tracker: print( - 
"preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}". - format( + "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".format( preprocess_time * 1000, inference_time * 1000, postprocess_time * 1000, - tracking_time * 1000, )) + tracking_time * 1000, + ) + ) else: print( - "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}". - format( + "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".format( preprocess_time * 1000, inference_time * 1000, - postprocess_time * 1000, )) + postprocess_time * 1000, + ) + ) def report(self, average=False): dic = {} @@ -294,18 +294,13 @@ def report(self, average=False): post_time = self.postprocess_time_s.value() track_time = self.tracking_time_s.value() - dic["preprocess_time_s"] = (round(pre_time / max(1, self.img_num), 4) - if average else pre_time) - dic["inference_time_s"] = (round(infer_time / max(1, self.img_num), 4) - if average else infer_time) - dic["postprocess_time_s"] = (round(post_time / max(1, self.img_num), 4) - if average else post_time) + dic["preprocess_time_s"] = round(pre_time / max(1, self.img_num), 4) if average else pre_time + dic["inference_time_s"] = round(infer_time / max(1, self.img_num), 4) if average else infer_time + dic["postprocess_time_s"] = round(post_time / max(1, self.img_num), 4) if average else post_time dic["img_num"] = self.img_num total_time = pre_time + infer_time + post_time if self.with_tracker: - dic["tracking_time_s"] = ( - round(track_time / max(1, self.img_num), 4) - if average else track_time) + dic["tracking_time_s"] = round(track_time / max(1, self.img_num), 4) if average else track_time total_time = total_time + track_time dic["total_time_s"] = round(total_time, 4) return dic @@ -513,10 +508,9 @@ def gaussian_radius(bbox_size, min_overlap): def gaussian2D(shape, sigma_x=1, sigma_y=1): m, n = [(ss - 1.0) / 2.0 for ss in shape] - y, x = np.ogrid[-m:m + 1, -n:n + 1] + y, x = np.ogrid[-m : m + 1, -n : n + 1] - h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * - sigma_y))) + h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y))) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h @@ -526,8 +520,7 @@ def draw_umich_gaussian(heatmap, center, radius, k=1): draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 """ diameter = 2 * radius + 1 - gaussian = gaussian2D( - (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) + gaussian = gaussian2D((diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) x, y = int(center[0]), int(center[1]) @@ -536,9 +529,8 @@ def draw_umich_gaussian(heatmap, center, radius, k=1): left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] + masked_heatmap = heatmap[y - top : y + bottom, x - left : x + right] + masked_gaussian = gaussian[radius - top : radius + bottom, radius - left : radius + right] if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py 
b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py index 3fdd640c1969b..6ea9f1b4a241b 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py @@ -42,8 +42,7 @@ def visualize_box_mask(im, results, labels, threshold=0.5): elif isinstance(im, np.ndarray): im = Image.fromarray(im) if "masks" in results and "boxes" in results and len(results["boxes"]) > 0: - im = draw_mask( - im, results["boxes"], results["masks"], labels, threshold=threshold) + im = draw_mask(im, results["boxes"], results["masks"], labels, threshold=threshold) if "boxes" in results and len(results["boxes"]) > 0: im = draw_box(im, results["boxes"], labels, threshold=threshold) if "segm" in results: @@ -53,7 +52,8 @@ def visualize_box_mask(im, results, labels, threshold=0.5): results["label"], results["score"], labels, - threshold=threshold, ) + threshold=threshold, + ) return im @@ -74,7 +74,7 @@ def get_color_map_list(num_classes): color_map[i * 3 + 2] |= ((lab >> 2) & 1) << (7 - j) j += 1 lab >>= 3 - color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + color_map = [color_map[i : i + 3] for i in range(0, len(color_map), 3)] return color_map @@ -141,40 +141,31 @@ def draw_box(im, np_boxes, labels, threshold=0.5): if len(bbox) == 4: xmin, ymin, xmax, ymax = bbox - print("class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}]," - "right_bottom:[{:.2f},{:.2f}]".format( - int(clsid), score, xmin, ymin, xmax, ymax)) + print( + "class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}]," + "right_bottom:[{:.2f},{:.2f}]".format(int(clsid), score, xmin, ymin, xmax, ymax) + ) # draw bbox draw.line( - [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), - (xmin, ymin)], + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], width=draw_thickness, - fill=color, ) + fill=color, + ) elif len(bbox) == 8: x1, y1, x2, y2, x3, y3, x4, y4 = bbox - draw.line( - [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], - width=2, - fill=color) + draw.line([(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill=color) xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) # draw label text = "{} {:.4f}".format(labels[clsid], score) tw, th = draw.textsize(text) - draw.rectangle( - [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + draw.rectangle([(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) return im -def draw_segm(im, - np_segms, - np_label, - np_score, - labels, - threshold=0.5, - alpha=0.7): +def draw_segm(im, np_segms, np_label, np_score, labels, threshold=0.5, alpha=0.7): """ Draw segmentation on image """ @@ -204,8 +195,7 @@ def draw_segm(im, sum_y = np.sum(mask, axis=1) y = np.where(sum_y > 0.5)[0] x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] - cv2.rectangle(im, (x0, y0), (x1, y1), - tuple(color_mask.astype("int32").tolist()), 1) + cv2.rectangle(im, (x0, y0), (x1, y1), tuple(color_mask.astype("int32").tolist()), 1) bbox_text = "%s %.2f" % (labels[clsid], score) t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] cv2.rectangle( @@ -213,7 +203,8 @@ def draw_segm(im, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), tuple(color_mask.astype("int32").tolist()), - -1, ) + -1, + ) cv2.putText( im, bbox_text, @@ -222,7 +213,8 @@ def draw_segm(im, 0.3, (0, 0, 0), 1, - lineType=cv2.LINE_AA, ) + lineType=cv2.LINE_AA, + ) return Image.fromarray(im.astype("uint8")) @@ -233,20 +225,20 @@ def get_color(idx): 
def visualize_pose( - imgfile, - results, - visual_thresh=0.6, - save_name="pose.jpg", - save_dir="output", - returnimg=False, - ids=None, ): + imgfile, + results, + visual_thresh=0.6, + save_name="pose.jpg", + save_dir="output", + returnimg=False, + ids=None, +): try: import matplotlib.pyplot as plt plt.switch_backend("agg") except Exception as e: - print("Matplotlib not found, please install matplotlib." - "for example: `pip install matplotlib`.") + print("Matplotlib not found, please install matplotlib." "for example: `pip install matplotlib`.") raise e skeletons, _ = results["keypoint"] skeletons = np.array(skeletons) @@ -323,8 +315,7 @@ def visualize_pose( bboxs = results["bbox"] for j, rect in enumerate(bboxs): xmin, ymin, xmax, ymax = rect - color = (colors[0] if color_set is None else - colors[color_set[j] % len(colors)]) + color = colors[0] if color_set is None else colors[color_set[j] % len(colors)] cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) canvas = img.copy() @@ -333,8 +324,7 @@ def visualize_pose( if skeletons[j][i, 2] < visual_thresh: continue if ids is None: - color = (colors[i] if color_set is None else - colors[color_set[j] % len(colors)]) + color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) @@ -343,15 +333,15 @@ def visualize_pose( tuple(skeletons[j][i, 0:2].astype("int32")), 2, color, - thickness=-1, ) + thickness=-1, + ) stickwidth = 2 for i in range(NUM_EDGES): for j in range(len(skeletons)): edge = EDGES[i] - if (skeletons[j][edge[0], 2] < visual_thresh or - skeletons[j][edge[1], 2] < visual_thresh): + if skeletons[j][edge[0], 2] < visual_thresh or skeletons[j][edge[1], 2] < visual_thresh: continue cur_canvas = canvas.copy() @@ -359,22 +349,18 @@ def visualize_pose( Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] mX = np.mean(X) mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) if ids is None: - color = (colors[i] if color_set is None else - colors[color_set[j] % len(colors)]) + color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) cv2.fillConvexPoly(cur_canvas, polygon, color) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) if returnimg: return canvas - save_name = os.path.join( - save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg") + save_name = os.path.join(save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg") plt.imsave(save_name, canvas[:, :, ::-1]) print("keypoint visualize image saved to: " + save_name) plt.close() @@ -414,5 +400,6 @@ def visualize_attr(im, results, boxes=None, is_mtmct=False): cv2.FONT_ITALIC, text_scale, (0, 255, 255), - thickness=text_thickness, ) + thickness=text_thickness, + ) return im diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py index 53102b4c87bb4..1284578b851f1 100644 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py @@ -32,71 +32,80 @@ def parse_args(): parser = 
argparse.ArgumentParser(description="Model prediction") # params of prediction - parser.add_argument( - "--config", dest="cfg", help="The config file.", default=None, type=str) + parser.add_argument("--config", dest="cfg", help="The config file.", default=None, type=str) parser.add_argument( "--model_path", dest="model_path", help="The path of model for prediction", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--image_path", dest="image_path", help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--save_dir", dest="save_dir", help="The directory for saving the predicted results", type=str, - default="./output/result", ) + default="./output/result", + ) # augment for prediction parser.add_argument( "--aug_pred", dest="aug_pred", help="Whether to use mulit-scales and flip augment for prediction", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--scales", dest="scales", nargs="+", help="Scales for augment", type=float, - default=1.0, ) + default=1.0, + ) parser.add_argument( "--flip_horizontal", dest="flip_horizontal", help="Whether to use flip horizontally augment", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--flip_vertical", dest="flip_vertical", help="Whether to use flip vertically augment", - action="store_true", ) + action="store_true", + ) # sliding window prediction parser.add_argument( "--is_slide", dest="is_slide", help="Whether to prediction by sliding window", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--crop_size", dest="crop_size", nargs=2, help="The crop size of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) parser.add_argument( "--stride", dest="stride", nargs=2, help="The stride of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) # custom color map parser.add_argument( @@ -105,7 +114,8 @@ def parse_args(): nargs="+", help="Save images with a custom color map. Default: None, use paddleseg's default color map.", type=int, - default=None, ) + default=None, + ) # set device parser.add_argument( @@ -113,7 +123,8 @@ def parse_args(): dest="device", help="Device place to be set, which can be GPU, XPU, NPU, CPU", default="gpu", - type=str, ) + type=str, + ) return parser.parse_args() @@ -301,8 +312,7 @@ def get_test_config(cfg, args): def main(args): env_info = get_sys_env() - if (args.device == "gpu" and env_info["Paddle compiled with cuda"] and - env_info["GPUs used"]): + if args.device == "gpu" and env_info["Paddle compiled with cuda"] and env_info["GPUs used"]: place = "gpu" elif args.device == "xpu" and paddle.is_compiled_with_xpu(): place = "xpu" @@ -337,10 +347,13 @@ def main(args): image_list=image_list, image_dir=image_dir, save_dir=args.save_dir, - **test_config, ) + **test_config, + ) -checkpoint_file = "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams" +checkpoint_file = ( + "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams" +) class SegformerDetector: @@ -350,27 +363,21 @@ def __init__(self, mode): "ade20k", ], f"mode should in {['cityscapes', 'ade20k']}!" 
if mode == "cityscapes": - segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, - "segformer_model") - modelpath = os.path.join(segformer_annotator_ckpts_path, - "model.pdparams") + segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model") + modelpath = os.path.join(segformer_annotator_ckpts_path, "model.pdparams") if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import \ - get_path_from_url_with_filelock + from paddlenlp.utils.downloader import get_path_from_url_with_filelock - get_path_from_url_with_filelock( - checkpoint_file, root_dir=segformer_annotator_ckpts_path) + get_path_from_url_with_filelock(checkpoint_file, root_dir=segformer_annotator_ckpts_path) self.model_path = modelpath - cfg = ( - "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml" - ) + cfg = "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml" else: - segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, - "segformer_model") + segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model") modelpath = os.path.join( segformer_annotator_ckpts_path, - "segformer_b5_ade20k_512x512_160k.pdparams", ) + "segformer_b5_ade20k_512x512_160k.pdparams", + ) self.model_path = modelpath @@ -404,9 +411,9 @@ def __call__(self, img): save_dir="output", skip_save=True, custom_color=custom_color_flatten, - **self.test_config, ) - pred_mask = cv2.cvtColor( - np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) + **self.test_config, + ) + pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) return pred_mask diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py index 6077f36175759..5e1850259a3f1 100644 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py +++ b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py @@ -33,7 +33,7 @@ def mkdir(path): def partition_list(arr, m): """split the list 'arr' into m pieces""" n = int(math.ceil(len(arr) / float(m))) - return [arr[i:i + n] for i in range(0, len(arr), n)] + return [arr[i : i + n] for i in range(0, len(arr), n)] def preprocess(im_path, transforms): @@ -47,20 +47,21 @@ def preprocess(im_path, transforms): def predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, +): """ predict and visualize the image_list. 
@@ -112,7 +113,8 @@ def predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -120,7 +122,8 @@ def predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -133,16 +136,14 @@ def predict( im_file = im_file[1:] # save added image - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) @@ -151,21 +152,22 @@ def predict( def quick_predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, - skip_save=True, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, + skip_save=True, +): """ predict and visualize the image_list. @@ -218,7 +220,8 @@ def quick_predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -226,7 +229,8 @@ def quick_predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -241,8 +245,7 @@ def quick_predict( # save added image if not skip_save: - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) @@ -250,8 +253,7 @@ def quick_predict( # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) if not skip_save: - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py index 76919bda8b88c..5d041d259a4ad 100644 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py @@ -32,71 +32,80 @@ def parse_args(): parser = argparse.ArgumentParser(description="Model prediction") # params of prediction - parser.add_argument( - "--config", dest="cfg", help="The config file.", default=None, type=str) + parser.add_argument("--config", dest="cfg", help="The config file.", default=None, 
type=str) parser.add_argument( "--model_path", dest="model_path", help="The path of model for prediction", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--image_path", dest="image_path", help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--save_dir", dest="save_dir", help="The directory for saving the predicted results", type=str, - default="./output/result", ) + default="./output/result", + ) # augment for prediction parser.add_argument( "--aug_pred", dest="aug_pred", help="Whether to use mulit-scales and flip augment for prediction", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--scales", dest="scales", nargs="+", help="Scales for augment", type=float, - default=1.0, ) + default=1.0, + ) parser.add_argument( "--flip_horizontal", dest="flip_horizontal", help="Whether to use flip horizontally augment", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--flip_vertical", dest="flip_vertical", help="Whether to use flip vertically augment", - action="store_true", ) + action="store_true", + ) # sliding window prediction parser.add_argument( "--is_slide", dest="is_slide", help="Whether to prediction by sliding window", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--crop_size", dest="crop_size", nargs=2, help="The crop size of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) parser.add_argument( "--stride", dest="stride", nargs=2, help="The stride of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) # custom color map parser.add_argument( @@ -105,7 +114,8 @@ def parse_args(): nargs="+", help="Save images with a custom color map. 
Default: None, use paddleseg's default color map.", type=int, - default=None, ) + default=None, + ) # set device parser.add_argument( @@ -113,7 +123,8 @@ def parse_args(): dest="device", help="Device place to be set, which can be GPU, XPU, NPU, CPU", default="gpu", - type=str, ) + type=str, + ) return parser.parse_args() @@ -301,8 +312,7 @@ def get_test_config(cfg, args): def main(args): env_info = get_sys_env() - if (args.device == "gpu" and env_info["Paddle compiled with cuda"] and - env_info["GPUs used"]): + if args.device == "gpu" and env_info["Paddle compiled with cuda"] and env_info["GPUs used"]: place = "gpu" elif args.device == "xpu" and paddle.is_compiled_with_xpu(): place = "xpu" @@ -337,24 +347,23 @@ def main(args): image_list=image_list, image_dir=image_dir, save_dir=args.save_dir, - **test_config, ) + **test_config, + ) -checkpoint_file = "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams" +checkpoint_file = ( + "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams" +) class SegmenterDetector: def __init__(self): - segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path, - "segmenter_model") - modelpath = os.path.join(segmenter_annotator_ckpts_path, - "model.pdparams") + segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segmenter_model") + modelpath = os.path.join(segmenter_annotator_ckpts_path, "model.pdparams") if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import \ - get_path_from_url_with_filelock + from paddlenlp.utils.downloader import get_path_from_url_with_filelock - get_path_from_url_with_filelock( - checkpoint_file, root_dir=segmenter_annotator_ckpts_path) + get_path_from_url_with_filelock(checkpoint_file, root_dir=segmenter_annotator_ckpts_path) self.model_path = modelpath cfg = "annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml" @@ -385,9 +394,9 @@ def __call__(self, img): save_dir="output", skip_save=True, custom_color=custom_color_flatten, - **self.test_config, ) - pred_mask = cv2.cvtColor( - np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) + **self.test_config, + ) + pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) return pred_mask diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py index 6077f36175759..5e1850259a3f1 100644 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py +++ b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py @@ -33,7 +33,7 @@ def mkdir(path): def partition_list(arr, m): """split the list 'arr' into m pieces""" n = int(math.ceil(len(arr) / float(m))) - return [arr[i:i + n] for i in range(0, len(arr), n)] + return [arr[i : i + n] for i in range(0, len(arr), n)] def preprocess(im_path, transforms): @@ -47,20 +47,21 @@ def preprocess(im_path, transforms): def predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, +): 
""" predict and visualize the image_list. @@ -112,7 +113,8 @@ def predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -120,7 +122,8 @@ def predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -133,16 +136,14 @@ def predict( im_file = im_file[1:] # save added image - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) @@ -151,21 +152,22 @@ def predict( def quick_predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, - skip_save=True, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, + skip_save=True, +): """ predict and visualize the image_list. @@ -218,7 +220,8 @@ def quick_predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -226,7 +229,8 @@ def quick_predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -241,8 +245,7 @@ def quick_predict( # save added image if not skip_save: - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) @@ -250,8 +253,7 @@ def quick_predict( # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) if not skip_save: - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) diff --git a/ppdiffusers/examples/controlnet/annotator/util.py b/ppdiffusers/examples/controlnet/annotator/util.py index 069005f683d59..7231c67ac5507 100644 --- a/ppdiffusers/examples/controlnet/annotator/util.py +++ b/ppdiffusers/examples/controlnet/annotator/util.py @@ -53,16 +53,15 @@ def resize_image(input_image, resolution): img = cv2.resize( input_image, (W, H), - interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, ) + interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, + ) return img def make_noise_disk(H, W, C, F): - noise = np.random.uniform( - low=0, high=1, size=((H // F) + 2, (W // F) + 
2, C)) - noise = cv2.resize( - noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC) - noise = noise[F:F + H, F:F + W] + noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C)) + noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC) + noise = noise[F : F + H, F : F + W] noise -= np.min(noise) noise /= np.max(noise) if C == 1: diff --git a/ppdiffusers/examples/controlnet/control/control_args.py b/ppdiffusers/examples/controlnet/control/control_args.py index 82e5c32ab1181..6a688687e1a27 100644 --- a/ppdiffusers/examples/controlnet/control/control_args.py +++ b/ppdiffusers/examples/controlnet/control/control_args.py @@ -22,44 +22,28 @@ class ModelArguments: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "pretrained_vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_encoder_name_or_path"}) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }, ) - model_max_length: Optional[int] = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field( - default=50, metadata={"help": "num_inference_steps"}) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, + ) + model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + num_inference_steps: Optional[int] = field(default=50, metadata={"help": "num_inference_steps"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) pretrained_model_name_or_path: str = field( default="runwayml/stable-diffusion-v1-5", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." 
- }, ) - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) - sd_locked: bool = field( - default=True, metadata={"help": "lock unet output_blocks and out."}) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init."}) - only_mid_control: bool = field( - default=False, metadata={"help": "only_mid_control."}) - is_ldmbert: bool = field( - default=False, metadata={"help": "Whether to use ldmbert."}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) + sd_locked: bool = field(default=True, metadata={"help": "lock unet output_blocks and out."}) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."}) + only_mid_control: bool = field(default=False, metadata={"help": "only_mid_control."}) + is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) @dataclass @@ -71,8 +55,7 @@ class DataArguments: resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) - file_path: str = field( - default="./fill50k", metadata={"help": "The path to of the fill50k."}) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) + file_path: str = field(default="./fill50k", metadata={"help": "The path to of the fill50k."}) diff --git a/ppdiffusers/examples/controlnet/control/control_trainer.py b/ppdiffusers/examples/controlnet/control/control_trainer.py index 506dfc88664cb..0b40903ded378 100644 --- a/ppdiffusers/examples/controlnet/control/control_trainer.py +++ b/ppdiffusers/examples/controlnet/control/control_trainer.py @@ -18,8 +18,11 @@ import paddle.amp.auto_cast as autocast from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK, - VisualDLCallback, rewrite_logs) +from paddlenlp.trainer.integrations import ( + INTEGRATION_TO_CALLBACK, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.utils.log import logger from ppdiffusers.training_utils import unwrap_model @@ -36,19 +39,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -58,20 +59,22 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"]) - image_logs["control"] = model.decode_control_image( - controlnet_cond=inputs["controlnet_cond"]) + image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) + image_logs["control"] = model.decode_control_image(controlnet_cond=inputs["controlnet_cond"]) image_logs["ddim-samples-9.0"] = model.log_image( input_ids=inputs["input_ids"], controlnet_cond=inputs["controlnet_cond"], guidance_scale=9.0, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) if self.vdl_writer is None: self._init_summary_writer(args) @@ -86,11 +89,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." 
+ ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -103,14 +106,11 @@ def compute_loss(self, model, inputs, return_outputs=False): loss = model(**inputs) return loss - def _save(self, - output_dir=None, - state_dict=None, - merge_tensor_parallel=False): + def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): super()._save( output_dir=output_dir, state_dict=state_dict, - merge_tensor_parallel=merge_tensor_parallel, ) + merge_tensor_parallel=merge_tensor_parallel, + ) output_dir = output_dir if output_dir is not None else self.args.output_dir - unwrap_model(self.model).controlnet.save_pretrained( - os.path.join(output_dir, "controlnet")) + unwrap_model(self.model).controlnet.save_pretrained(os.path.join(output_dir, "controlnet")) diff --git a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py index 78c3c2bfdbf84..c67eca10fb034 100644 --- a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py +++ b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py @@ -35,7 +35,8 @@ def __init__(self, tokenizer, file_path="./fill50k"): padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] def __len__(self): return len(self.data) @@ -63,9 +64,7 @@ def __getitem__(self, idx): input_ids = self.text_processing(prompt) return dict( - input_ids=paddle.to_tensor( - input_ids, dtype=paddle.int64), - pixel_values=paddle.to_tensor( - target.transpose([2, 0, 1]), dtype=paddle.float32), - controlnet_cond=paddle.to_tensor( - source.transpose([2, 0, 1]), dtype=paddle.float32), ) + input_ids=paddle.to_tensor(input_ids, dtype=paddle.int64), + pixel_values=paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32), + controlnet_cond=paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32), + ) diff --git a/ppdiffusers/examples/controlnet/control/model.py b/ppdiffusers/examples/controlnet/control/model.py index de2bfb4ee5d47..c0d86532d5021 100644 --- a/ppdiffusers/examples/controlnet/control/model.py +++ b/ppdiffusers/examples/controlnet/control/model.py @@ -22,9 +22,15 @@ from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler, - DDPMScheduler, LDMBertModel, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + DDPMScheduler, + LDMBertModel, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.initializer import reset_initialized_parameter from ppdiffusers.models.ema import LitEma from ppdiffusers.training_utils import freeze_params @@ -42,18 +48,20 @@ def __init__(self, model_args): # init tokenizer tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - model_max_length=model_args.model_max_length) + tokenizer_name_or_path, model_max_length=model_args.model_max_length + ) vae_name = "vqvae" if 
model_args.is_ldmbert else "vae" # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, vae_name)) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, vae_name) + ) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) freeze_params(self.vae.parameters()) @@ -62,55 +70,54 @@ def __init__(self, model_args): if model_args.is_ldmbert: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "bert")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "bert") + ) # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path) else: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) freeze_params(self.text_encoder.parameters()) logger.info("Freeze text_encoder parameters!") unet_name_or_path = ( model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) freeze_params(self.unet.parameters()) logger.info("Freeze unet parameters!") - self.controlnet = ControlNetModel.from_unet( - self.unet, load_weights_from_unet=True) + self.controlnet = ControlNetModel.from_unet(self.unet, load_weights_from_unet=True) if not model_args.use_paddle_conv_init: # use torch conv2d init - reset_initialized_parameter( - self.controlnet.controlnet_cond_embedding.conv_in) - reset_initialized_parameter( - self.controlnet.controlnet_cond_embedding.blocks) + reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.conv_in) + reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.blocks) self.noise_scheduler = DDPMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) self.eval_scheduler.set_timesteps(model_args.num_inference_steps) self.use_ema = model_args.use_ema if self.use_ema: @@ -118,15 +125,15 @@ def __init__(self, model_args): self.control_scales = [1.0] * 13 self.only_mid_control = model_args.only_mid_control - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() self.controlnet.enable_xformers_memory_efficient_attention() 
except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) @contextlib.contextmanager def ema_scope(self, context=None): @@ -147,11 +154,7 @@ def on_train_batch_end(self): if self.use_ema: self.model_ema(self.controlnet) - def forward(self, - input_ids=None, - pixel_values=None, - controlnet_cond=None, - **kwargs): + def forward(self, input_ids=None, pixel_values=None, controlnet_cond=None, **kwargs): self.train() with paddle.amp.auto_cast(enable=False): with paddle.no_grad(): @@ -160,11 +163,10 @@ def forward(self, latents = self.vae.encode(pixel_values).latent_dist.sample() latents = latents * 0.18215 noise = paddle.randn(latents.shape) - timesteps = paddle.randint( - 0, self.noise_scheduler.num_train_timesteps, - (latents.shape[0], )).astype("int64") - noisy_latents = self.noise_scheduler.add_noise(latents, noise, - timesteps) + timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype( + "int64" + ) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) encoder_hidden_states = self.text_encoder(input_ids)[0] # control down_block_res_samples, mid_block_res_sample = self.controlnet( @@ -173,7 +175,8 @@ def forward(self, encoder_hidden_states=encoder_hidden_states, controlnet_cond=controlnet_cond, conditioning_scale=self.control_scales, - return_dict=False, ) + return_dict=False, + ) # predict the noise residual noise_pred = self.unet( @@ -181,7 +184,8 @@ def forward(self, timestep=timesteps, encoder_hidden_states=encoder_hidden_states, down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, ).sample + mid_block_additional_residual=mid_block_res_sample, + ).sample loss = F.mse_loss(noise_pred, noise, reduction="mean") return loss @@ -198,25 +202,23 @@ def decode_image(self, pixel_values=None, **kwargs): @paddle.no_grad() def decode_control_image(self, controlnet_cond=None, **kwargs): - return ((255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32") - .numpy().round()) + return (255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round() @paddle.no_grad() def log_image( - self, - input_ids=None, - controlnet_cond=None, - height=512, - width=512, - eta=0.0, - guidance_scale=7.5, - **kwargs, ): + self, + input_ids=None, + controlnet_cond=None, + height=512, + width=512, + eta=0.0, + guidance_scale=7.5, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 8 image if input_ids.shape[0] > 4: input_ids = input_ids[:4] @@ -230,34 +232,30 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, - height // 8, width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) # ddim donot use this latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta - controlnet_cond_input = (paddle.concat([controlnet_cond] * 2) - if do_classifier_free_guidance else - controlnet_cond) + controlnet_cond_input = ( + paddle.concat([controlnet_cond] * 2) if do_classifier_free_guidance else controlnet_cond + ) for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # ControlNet predict the noise residual down_block_res_samples, mid_block_res_sample = self.controlnet( @@ -266,7 +264,8 @@ def log_image( encoder_hidden_states=text_embeddings, controlnet_cond=controlnet_cond_input, conditioning_scale=self.control_scales, - return_dict=False, ) + return_dict=False, + ) # predict the noise residual noise_pred = self.unet( @@ -274,17 +273,16 @@ def log_image( t, encoder_hidden_states=text_embeddings, down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, ).sample + mid_block_additional_residual=mid_block_res_sample, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample @@ -296,7 +294,6 @@ def set_recompute(self, value=False): def fn(layer): if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.controlnet.apply(fn) diff --git a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py index 
0cb439f90dd2b..17582dd93e648 100644 --- a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py +++ b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py @@ -24,13 +24,11 @@ def extract_controlnet_ema_weights(model_path, output_path): for k in state_dict.keys(): if k.startswith("controlnet."): flat_ema_key = "model_ema." + "".join(k.split(".")[1:]) - ema_state_dict[k.replace("controlnet.", "")] = state_dict.get( - flat_ema_key) + ema_state_dict[k.replace("controlnet.", "")] = state_dict.get(flat_ema_key) if len(ema_state_dict) == 0: raise ValueError("Can not extract ema weights!") os.makedirs(output_path, exist_ok=True) - paddle.save(ema_state_dict, - os.path.join(output_path, "model_state.ema.pdparams")) + paddle.save(ema_state_dict, os.path.join(output_path, "model_state.ema.pdparams")) print(f"Save EMA weights to {output_path} !") @@ -40,11 +38,13 @@ def extract_controlnet_ema_weights(model_path, output_path): "--model_path", type=str, default="./model_state.pdparams", - help="model_state.", ) + help="model_state.", + ) parser.add_argument( "--output_path", type=str, default="ema_controlnet", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() extract_controlnet_ema_weights(args.model_path, args.output_path) diff --git a/ppdiffusers/examples/controlnet/gradio_canny2image.py b/ppdiffusers/examples/controlnet/gradio_canny2image.py index 5dc43a6ca4f8e..5c0ad9e936299 100644 --- a/ppdiffusers/examples/controlnet/gradio_canny2image.py +++ b/ppdiffusers/examples/controlnet/gradio_canny2image.py @@ -27,39 +27,37 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - low_threshold, - high_threshold, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, + low_threshold, + high_threshold, +): with paddle.no_grad(): img = resize_image(HWC3(input_image), image_resolution) H, W, C = img.shape detected_map = apply_canny(img, low_threshold, high_threshold) detected_map = HWC3(detected_map) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -75,7 +73,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [255 - detected_map] + results @@ -91,59 +90,55 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) low_threshold = gr.Slider( label="Canny low threshold", minimum=1, maximum=255, value=100, - step=1, ) + step=1, + ) high_threshold = gr.Slider( label="Canny high threshold", minimum=1, maximum=255, value=200, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_depth2image.py b/ppdiffusers/examples/controlnet/gradio_depth2image.py index 63b50704b9bff..67f33684cc947 100644 --- a/ppdiffusers/examples/controlnet/gradio_depth2image.py +++ b/ppdiffusers/examples/controlnet/gradio_depth2image.py @@ -28,37 +28,34 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) - detected_map, _ = apply_midas( - resize_image(input_image, detect_resolution)) + detected_map, _ = apply_midas(resize_image(input_image, detect_resolution)) detected_map = HWC3(detected_map) 
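The control_scales expression reformatted in gradio_canny2image above (and repeated in the other demo scripts below) is worth a line of arithmetic: with guess mode enabled it yields 13 per-residual weights that decay geometrically by a factor of 0.825, so the last entry (applied to the deepest, mid-block residual) keeps the full strength while earlier entries are progressively attenuated. A small illustration with an assumed strength of 1.0; the numbers are plain arithmetic, not taken from the repository:

strength = 1.0
scales = [strength * (0.825 ** float(12 - i)) for i in range(13)]
# i = 12 -> 0.825**0  = 1.0      full weight on the last (mid-block) residual
# i = 0  -> 0.825**12 ~= 0.099   roughly a tenth of `strength` on the first one
# without guess mode every entry is simply `strength`
assert len(scales) == 13 and abs(scales[-1] - strength) < 1e-9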
img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) if seed == -1: @@ -75,7 +72,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=1.0, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -91,53 +89,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Depth Resolution", minimum=128, maximum=1024, value=384, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_hed2image.py b/ppdiffusers/examples/controlnet/gradio_hed2image.py index 87e37dccb3043..9394f85ba697d 100644 --- a/ppdiffusers/examples/controlnet/gradio_hed2image.py +++ b/ppdiffusers/examples/controlnet/gradio_hed2image.py @@ -28,25 +28,25 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with 
paddle.no_grad(): input_image = HWC3(input_image) detected_map = apply_hed(resize_image(input_image, detect_resolution)) @@ -54,16 +54,13 @@ def process( img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_LINEAR) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -79,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -95,53 +93,42 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="HED Resolution", - minimum=128, - maximum=1024, - value=512, - step=1) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_hough2image.py b/ppdiffusers/examples/controlnet/gradio_hough2image.py index eef44cc32f7b0..65ff6c1410769 100644 --- a/ppdiffusers/examples/controlnet/gradio_hough2image.py +++ b/ppdiffusers/examples/controlnet/gradio_hough2image.py @@ -28,46 +28,44 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") pipe = 
StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - value_threshold, - distance_threshold, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, + value_threshold, + distance_threshold, +): with paddle.no_grad(): input_image = HWC3(input_image) detected_map = apply_mlsd( resize_image(input_image, detect_resolution), value_threshold, - distance_threshold, ) + distance_threshold, + ) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -83,7 +81,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -99,65 +98,62 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Hough Line Resolution", minimum=128, maximum=1024, value=512, - step=1, ) + step=1, + ) value_threshold = gr.Slider( label="Hough value threshold (MLSD)", minimum=0.01, maximum=2.0, value=0.1, - step=0.01, ) + step=0.01, + ) distance_threshold = gr.Slider( label="Hough distance threshold (MLSD)", minimum=0.01, maximum=20.0, value=0.1, - step=0.01, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=0.01, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = 
gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py index 0d23830f2c4be..7f164b57c63be 100644 --- a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py +++ b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py @@ -23,41 +23,37 @@ from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/control_v11e_sd15_ip2p") +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_ip2p") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): img = resize_image(HWC3(input_image), image_resolution) detected_map = input_image.copy() H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_LINEAR) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -73,7 +69,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -89,47 +86,41 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_normal2image.py b/ppdiffusers/examples/controlnet/gradio_normal2image.py index 6ce2e56d8ea3c..69bf238fe4521 100644 --- a/ppdiffusers/examples/controlnet/gradio_normal2image.py +++ b/ppdiffusers/examples/controlnet/gradio_normal2image.py @@ -28,43 +28,39 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - bg_threshold, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, + bg_threshold, +): with paddle.no_grad(): input_image = HWC3(input_image) - _, detected_map = apply_midas( - resize_image(input_image, detect_resolution), bg_th=bg_threshold) + _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + 
detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -80,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -96,59 +93,55 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Normal Resolution", minimum=128, maximum=1024, value=384, - step=1, ) + step=1, + ) bg_threshold = gr.Slider( label="Normal background threshold", minimum=0.0, maximum=1.0, value=0.4, - step=0.01, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=0.01, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py index 07a52bcf286d3..e932854042b60 100644 --- a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py +++ b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py @@ -26,45 +26,41 @@ apply_openpose = OpenposePaddleDetector() -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose") +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, 
- safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + hand, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) detected_map, _ = apply_openpose(input_image, detect_resolution, hand) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -80,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -97,53 +94,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="OpenPose Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, hand, 
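All of these gradio_*2image scripts share the same conditioning-image plumbing before the pipeline call: detector output, resized to the target height and width, converted to a float32 NCHW tensor scaled to [0, 1]. A standalone sketch of that step; detected_map stands for the HWC3 output of whichever detector (Canny, HED, MiDaS, MLSD, OpenPose, segmentation) the demo uses, and the interpolation default mirrors the nearest-neighbour resize most of the scripts use:

import cv2
import paddle

def make_control_tensor(detected_map, height, width, interpolation=cv2.INTER_NEAREST):
    # match the size of the image that will be generated
    detected_map = cv2.resize(detected_map, (width, height), interpolation=interpolation)
    control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
    # HWC -> NCHW with a batch dimension of 1, as the pipeline expects
    return control.unsqueeze(0).transpose([0, 3, 1, 2])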
diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py index 51a713db003db..097bbd83516d3 100644 --- a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py +++ b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py @@ -26,45 +26,41 @@ apply_ppdetpose = PPDetDetector() -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose") +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + hand, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) detected_map, _ = apply_ppdetpose(input_image, detect_resolution, hand) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -80,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -97,53 +94,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="OpenPose Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, hand, diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py index 0d89c0899ecb4..1e8bd335f71a5 100644 --- a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py +++ b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py @@ -28,42 +28,38 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) - detected_map = apply_uniformer( - resize_image(input_image, detect_resolution)) + detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), 
interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -79,7 +75,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -95,53 +92,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Segmentation Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py index b517ba3b94cc4..a99e82a4ea7e5 100644 --- a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py +++ b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py @@ -28,42 +28,38 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - 
ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) - detected_map = apply_uniformer( - resize_image(input_image, detect_resolution)) + detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -79,7 +75,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -95,53 +92,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Segmentation Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py index da5f05c890081..0e6313d0d407c 100644 --- 
a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py +++ b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py @@ -25,34 +25,32 @@ apply_shuffle = ContentShuffleDetector() -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/control_v11e_sd15_shuffle") +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): img = resize_image(HWC3(input_image), image_resolution) H, W, C = img.shape detected_map = apply_shuffle(img, w=W, h=H, f=256) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = [strength] * 13 @@ -70,7 +68,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -86,47 +85,41 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py index f94a1bebdee43..34910428889af 100644 --- a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py 
+++ b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py @@ -16,10 +16,14 @@ import os import paddle -from control import (ControlNet, ControlNetTrainer, DataArguments, - Fill50kDataset, ModelArguments) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from control import ( + ControlNet, + ControlNetTrainer, + DataArguments, + Fill50kDataset, + ModelArguments, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger @@ -29,15 +33,14 @@ def unfreeze_params(params): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) + math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps + ) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -45,16 +48,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -67,7 +68,8 @@ def main(): model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) @@ -76,7 +78,8 @@ def main(): trainer.model.controlnet.parameters(), trainer.model.unet.up_blocks.parameters(), trainer.model.unet.conv_norm_out.parameters(), - trainer.model.unet.conv_out.parameters(), ) + trainer.model.unet.conv_out.parameters(), + ) unfreeze_params(params_to_train) else: params_to_train = trainer.model.controlnet.parameters() diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth.py b/ppdiffusers/examples/dreambooth/train_dreambooth.py index f9e184fbf53a3..1aedbd3d57952 100644 --- a/ppdiffusers/examples/dreambooth/train_dreambooth.py +++ b/ppdiffusers/examples/dreambooth/train_dreambooth.py @@ -29,10 +29,10 @@ import paddle.nn as nn import paddle.nn.functional as F from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients -from paddle.io import (BatchSampler, DataLoader, Dataset, - DistributedBatchSampler) +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms from paddlenlp.trainer import set_seed @@ -41,8 +41,13 @@ from PIL import Image from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - UNet2DConditionModel, is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.models.modeling_utils import freeze_params, unwrap_model from ppdiffusers.optimization import get_scheduler from ppdiffusers.utils import check_min_version @@ -52,8 +57,7 @@ def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) class Lambda(BaseTransform): @@ -65,11 +69,11 @@ def _apply_image(self, img): return self.fn(img) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -78,8 +82,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -87,8 +92,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from 
ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -104,8 +110,7 @@ def fn(layer): # unet if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) model.apply(fn) @@ -125,8 +130,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training dreambooth script.") + parser = argparse.ArgumentParser(description="Simple example of a training dreambooth script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -145,19 +149,22 @@ def parse_args(input_args=None): type=str, default=None, required=True, - help="A folder containing the training data of instance images.", ) + help="A folder containing the training data of instance images.", + ) parser.add_argument( "--class_data_dir", type=str, default=None, required=False, - help="A folder containing the training data of class images.", ) + help="A folder containing the training data of class images.", + ) parser.add_argument( "--instance_prompt", type=str, default=None, required=True, - help="The prompt with identifier specifying the instance", ) + help="The prompt with identifier specifying the instance", + ) parser.add_argument( "--class_prompt", type=str, @@ -168,12 +175,14 @@ def parse_args(input_args=None): "--with_prior_preservation", default=False, action="store_true", - help="Flag to add prior preservation loss.", ) + help="Flag to add prior preservation loss.", + ) parser.add_argument( "--prior_loss_weight", type=float, default=1.0, - help="The weight of prior preservation loss.", ) + help="The weight of prior preservation loss.", + ) parser.add_argument( "--num_class_images", type=int, @@ -181,39 +190,42 @@ def parse_args(input_args=None): help=( "Minimal class images for prior preservation loss. If there are not enough images already present in" " class_data_dir, additional images will be sampled with class_prompt." - ), ) + ), + ) parser.add_argument( "--output_dir", type=str, default="./dreambooth-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -221,11 +233,13 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." 
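Several of the flags re-wrapped in this argparse block (--with_prior_preservation, --prior_loss_weight, --num_class_images) only matter for the prior-preservation variant of DreamBooth. As a reminder of how that weight is typically applied, a hedged sketch (not the verbatim code from this script): the batch concatenates instance and class examples, the prediction is split in two, and the class ("prior") term is scaled by prior_loss_weight before being added to the instance loss.

import paddle.nn.functional as F

def dreambooth_loss(noise_pred, target, prior_loss_weight, with_prior_preservation):
    if with_prior_preservation:
        # first half of the batch: instance images, second half: class images
        noise_pred, noise_pred_prior = noise_pred.chunk(2, axis=0)
        target, target_prior = target.chunk(2, axis=0)
        instance_loss = F.mse_loss(noise_pred, target, reduction="mean")
        prior_loss = F.mse_loss(noise_pred_prior, target_prior, reduction="mean")
        return instance_loss + prior_loss_weight * prior_loss
    return F.mse_loss(noise_pred, target, reduction="mean")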
- ), ) + ), + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -235,12 +249,14 @@ def parse_args(input_args=None): "--train_batch_size", type=int, default=4, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--sample_batch_size", type=int, default=4, - help="Batch size (per device) for sampling images.", ) + help="Batch size (per device) for sampling images.", + ) parser.add_argument("--num_train_epochs", type=int, default=1) parser.add_argument( "--max_train_steps", @@ -277,12 +293,15 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -293,45 +312,47 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) + help="Power factor of the polynomial scheduler.", + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -344,27 +365,28 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. 
Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=1.0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=1.0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) @@ -376,20 +398,15 @@ def parse_args(input_args=None): if args.with_prior_preservation: if args.class_data_dir is None: - raise ValueError( - "You must specify a data directory for class images.") + raise ValueError("You must specify a data directory for class images.") if args.class_prompt is None: raise ValueError("You must specify prompt for class images.") else: # logger is not available yet if args.class_data_dir is not None: - warnings.warn( - "You need not use --class_data_dir without --with_prior_preservation." - ) + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") if args.class_prompt is not None: - warnings.warn( - "You need not use --class_prompt without --with_prior_preservation." - ) + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") args.logging_dir = os.path.join(args.output_dir, args.logging_dir) if args.height is None or args.width is None and args.resolution is not None: @@ -405,18 +422,19 @@ class DreamBoothDataset(Dataset): """ def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - class_num=None, - height=512, - width=512, - center_crop=False, - interpolation="bilinear", - random_flip=False, ): + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + class_num=None, + height=512, + width=512, + center_crop=False, + interpolation="bilinear", + random_flip=False, + ): self.height = height self.width = width self.center_crop = center_crop @@ -442,8 +460,7 @@ def __init__( if any(suffix in p.name for suffix in ext): self.class_images_path.append(p) if class_num is not None: - self.num_class_images = min( - len(self.class_images_path), class_num) + self.num_class_images = min(len(self.class_images_path), class_num) else: self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) @@ -451,24 +468,22 @@ def __init__( else: self.class_data_root = None - self.image_transforms = transforms.Compose([ - transforms.Resize( - (height, width), interpolation=interpolation), - transforms.CenterCrop((height, width)) - if center_crop else transforms.RandomCrop((height, width)), - transforms.RandomHorizontalFlip() - if random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + self.image_transforms = transforms.Compose( + [ + transforms.Resize((height, width), interpolation=interpolation), + transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)), + 
transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def __len__(self): return self._length def __getitem__(self, index): example = {} - instance_image = Image.open(self.instance_images_path[ - index % self.num_instance_images]) + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) @@ -477,11 +492,11 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids if self.class_data_root: - class_image = Image.open(self.class_images_path[ - index % self.num_class_images]) + class_image = Image.open(self.class_images_path[index % self.num_class_images]) if not class_image.mode == "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) @@ -490,7 +505,8 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids return example @@ -512,9 +528,7 @@ def __getitem__(self, index): return example -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -547,45 +561,43 @@ def main(): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - if (args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + requires_safety_checker=False, + ) + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: pipeline.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) pipeline.set_progress_bar_config(disable=True) num_new_images = args.num_class_images - cur_class_images logger.info(f"Number of class images to sample: {num_new_images}.") sample_dataset = PromptDataset(args.class_prompt, num_new_images) - batch_sampler = (DistributedBatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False) if num_processes > 1 else BatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False)) + batch_sampler = ( + DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + if num_processes > 1 + else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + ) sample_dataloader = DataLoader( sample_dataset, batch_sampler=batch_sampler, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) for example in tqdm( - sample_dataloader, - desc="Generating class images", - disable=not is_main_process, ): + sample_dataloader, + desc="Generating class images", + disable=not is_main_process, + ): images = pipeline(example["prompt"]).images for i, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = ( - class_images_dir / - f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - ) + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) pipeline.to("cpu") del pipeline @@ -597,17 +609,14 @@ def main(): if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -617,30 +626,26 @@ def main(): if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - 
text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) freeze_params(vae.parameters()) if not args.train_text_encoder: @@ -650,21 +655,20 @@ def main(): if args.train_text_encoder: set_recompute(text_encoder, True) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # Dataset and DataLoaders creation: train_dataset = DreamBoothDataset( instance_data_root=args.instance_data_dir, instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir - if args.with_prior_preservation else None, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, class_prompt=args.class_prompt, class_num=args.num_class_images, tokenizer=tokenizer, @@ -672,7 +676,8 @@ def main(): width=args.width, center_crop=args.center_crop, interpolation="bilinear", - random_flip=args.random_flip, ) + random_flip=args.random_flip, + ) def collate_fn(examples): input_ids = [example["instance_prompt_ids"] for example in examples] @@ -687,38 +692,35 @@ def collate_fn(examples): pixel_values = paddle.stack(pixel_values).astype("float32") input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. 
- num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -726,23 +728,22 @@ def collate_fn(examples): text_encoder = paddle.DataParallel(text_encoder) params_to_optimize = ( - list(unet.parameters()) + list(text_encoder.parameters()) - if args.train_text_encoder else unet.parameters()) + list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) # Initialize the optimizer optimizer = AdamW( learning_rate=lr_scheduler, @@ -751,8 +752,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if is_main_process: logger.info("----------- Configuration Arguments -----------") @@ -762,25 +763,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
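# Note on the line below: tqdm's `disable` argument silences the bar, so in a multi-process
# launch only the main process (is_main_process) renders training progress.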
- progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 @@ -803,22 +798,24 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image timesteps = paddle.randint( 0, noise_scheduler.config.num_train_timesteps, - (batch_size, ), - dtype="int64", ) + (batch_size,), + dtype="int64", + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where @@ -826,55 +823,45 @@ def collate_fn(examples): if args.train_text_encoder: text_encoder_ctx_manager = text_encoder.no_sync() else: - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) - else contextlib.suppress()) + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) with text_encoder_ctx_manager: # Get the text embedding for conditioning if use_attention_mask: - attention_mask = (batch["input_ids"] != - tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") if args.with_prior_preservation: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. 
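# Context for the chunking below, assuming the collate step stacks instance examples first and
# class examples second (as in the standard DreamBooth prior-preservation recipe): chunk(2, axis=0)
# splits predictions/targets back into those halves; the instance half takes the reconstruction
# loss and the class half the prior loss, combined as
#   loss = mse(pred, target) + prior_loss_weight * mse(pred_prior, target_prior)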
- model_pred, model_pred_prior = model_pred.chunk( - 2, axis=0) + model_pred, model_pred_prior = model_pred.chunk(2, axis=0) target, target_prior = target.chunk(2, axis=0) # Compute instance loss loss = F.mse_loss(model_pred, target, reduction="mean") # Compute prior loss - prior_loss = F.mse_loss( - model_pred_prior, target_prior, reduction="mean") + prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean") # Add the prior loss to the instance loss. loss = loss + args.prior_loss_weight * prior_loss @@ -908,13 +895,10 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") - unwrap_model(unet).save_pretrained( - os.path.join(save_path, "unet")) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet")) if args.train_text_encoder: - unwrap_model(text_encoder).save_pretrained( - os.path.join(save_path, "text_encoder")) + unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder")) if global_step >= args.max_train_steps: break @@ -926,14 +910,12 @@ def collate_fn(examples): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unwrap_model(unet), - text_encoder=unwrap_model(text_encoder), ) + text_encoder=unwrap_model(text_encoder), + ) pipeline.save_pretrained(args.output_dir) if args.push_to_hub: - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) if __name__ == "__main__": diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py index b8837db5fb804..b36bc8b8f2130 100644 --- a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py +++ b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py @@ -32,10 +32,10 @@ import paddle.nn.functional as F import requests from huggingface_hub import HfFolder, create_repo, upload_folder, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients -from paddle.io import (BatchSampler, DataLoader, Dataset, - DistributedBatchSampler) +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms from paddlenlp.trainer import set_seed @@ -44,12 +44,21 @@ from PIL import Image from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin from ppdiffusers.models.attention_processor import ( - AttnProcessor, AttnProcessor2_5, LoRAAttnProcessor, LoRAAttnProcessor2_5) + AttnProcessor, + AttnProcessor2_5, + LoRAAttnProcessor, + LoRAAttnProcessor2_5, +) from ppdiffusers.optimization import get_scheduler from ppdiffusers.training_utils import freeze_params, unwrap_model from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, check_min_version @@ -62,14 +71,14 @@ def _retry( 
- func, - func_args: Optional[tuple]=None, - func_kwargs: Optional[dict]=None, - exceptions: Type[requests.exceptions.RequestException]=requests. - exceptions.RequestException, - max_retries: int=0, - base_wait_time: float=0.5, - max_wait_time: float=2, ): + func, + func_args: Optional[tuple] = None, + func_kwargs: Optional[dict] = None, + exceptions: Type[requests.exceptions.RequestException] = requests.exceptions.RequestException, + max_retries: int = 0, + base_wait_time: float = 0.5, + max_wait_time: float = 2, +): func_args = func_args or () func_kwargs = func_kwargs or {} retry = 0 @@ -80,27 +89,24 @@ def _retry( if retry >= max_retries: raise err else: - sleep_time = min(max_wait_time, base_wait_time * 2 - **retry) # Exponential backoff - logger.info( - f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]" - ) + sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff + logger.info(f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]") time.sleep(sleep_time) retry += 1 def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) def save_model_card( - repo_id: str, - images=None, - base_model=str, - train_text_encoder=False, - prompt=str, - repo_folder=None, ): + repo_id: str, + images=None, + base_model=str, + train_text_encoder=False, + prompt=str, + repo_folder=None, +): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -133,11 +139,11 @@ def save_model_card( f.write(yaml + model_card) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -146,8 +152,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -155,8 +162,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -187,8 +195,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training dreambooth lora script.") + parser = argparse.ArgumentParser(description="Simple example of a training dreambooth lora script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -207,19 +214,22 @@ def parse_args(input_args=None): type=str, default=None, required=True, - help="A folder containing the training data of instance images.", ) + help="A folder containing the training data 
of instance images.", + ) parser.add_argument( "--class_data_dir", type=str, default=None, required=False, - help="A folder containing the training data of class images.", ) + help="A folder containing the training data of class images.", + ) parser.add_argument( "--instance_prompt", type=str, default=None, required=True, - help="The prompt with identifier specifying the instance", ) + help="The prompt with identifier specifying the instance", + ) parser.add_argument( "--class_prompt", type=str, @@ -230,7 +240,8 @@ def parse_args(input_args=None): "--validation_prompt", type=str, default=None, - help="A prompt that is sampled during training for inference.", ) + help="A prompt that is sampled during training for inference.", + ) parser.add_argument( "--num_validation_images", type=int, @@ -244,17 +255,20 @@ def parse_args(input_args=None): help=( "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), ) + ), + ) parser.add_argument( "--with_prior_preservation", default=False, action="store_true", - help="Flag to add prior preservation loss.", ) + help="Flag to add prior preservation loss.", + ) parser.add_argument( "--prior_loss_weight", type=float, default=1.0, - help="The weight of prior preservation loss.", ) + help="The weight of prior preservation loss.", + ) parser.add_argument( "--num_class_images", type=int, @@ -262,44 +276,48 @@ def parse_args(input_args=None): help=( "Minimal class images for prior preservation loss. If there are not enough images already present in" " class_data_dir, additional images will be sampled with class_prompt." - ), ) + ), + ) parser.add_argument( "--output_dir", type=str, default="lora-dreambooth-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--lora_rank", type=int, default=4, - help="The rank of lora linear.", ) + help="The rank of lora linear.", + ) parser.add_argument( "--center_crop", default=False, @@ -307,16 +325,19 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." 
- ), ) + ), + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=4, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -326,7 +347,8 @@ def parse_args(input_args=None): "--sample_batch_size", type=int, default=4, - help="Batch size (per device) for sampling images.", ) + help="Batch size (per device) for sampling images.", + ) parser.add_argument("--num_train_epochs", type=int, default=1) parser.add_argument( "--max_train_steps", @@ -338,7 +360,8 @@ def parse_args(input_args=None): "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--gradient_accumulation_steps", type=int, @@ -368,12 +391,15 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -384,45 +410,47 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) + help="Power factor of the polynomial scheduler.", + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -435,22 +463,22 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. 
Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) @@ -462,20 +490,15 @@ def parse_args(input_args=None): if args.with_prior_preservation: if args.class_data_dir is None: - raise ValueError( - "You must specify a data directory for class images.") + raise ValueError("You must specify a data directory for class images.") if args.class_prompt is None: raise ValueError("You must specify prompt for class images.") else: # logger is not available yet if args.class_data_dir is not None: - warnings.warn( - "You need not use --class_data_dir without --with_prior_preservation." - ) + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") if args.class_prompt is not None: - warnings.warn( - "You need not use --class_prompt without --with_prior_preservation." - ) + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") args.logging_dir = os.path.join(args.output_dir, args.logging_dir) if args.height is None or args.width is None and args.resolution is not None: @@ -491,18 +514,19 @@ class DreamBoothDataset(Dataset): """ def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - class_num=None, - height=512, - width=512, - center_crop=False, - interpolation="bilinear", - random_flip=False, ): + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + class_num=None, + height=512, + width=512, + center_crop=False, + interpolation="bilinear", + random_flip=False, + ): self.height = height self.width = width self.center_crop = center_crop @@ -528,8 +552,7 @@ def __init__( if any(suffix in p.name for suffix in ext): self.class_images_path.append(p) if class_num is not None: - self.num_class_images = min( - len(self.class_images_path), class_num) + self.num_class_images = min(len(self.class_images_path), class_num) else: self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) @@ -537,24 +560,22 @@ def __init__( else: self.class_data_root = None - self.image_transforms = transforms.Compose([ - transforms.Resize( - (height, width), interpolation=interpolation), - transforms.CenterCrop((height, width)) - if center_crop else transforms.RandomCrop((height, width)), - transforms.RandomHorizontalFlip() - if random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + self.image_transforms = transforms.Compose( + [ + transforms.Resize((height, width), interpolation=interpolation), + transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)), + transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def __len__(self): return self._length def __getitem__(self, index): example = {} - 
instance_image = Image.open(self.instance_images_path[ - index % self.num_instance_images]) + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) @@ -563,11 +584,11 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids if self.class_data_root: - class_image = Image.open(self.class_images_path[ - index % self.num_class_images]) + class_image = Image.open(self.class_images_path[index % self.num_class_images]) if not class_image.mode == "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) @@ -576,7 +597,8 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids return example @@ -598,9 +620,7 @@ def __getitem__(self, index): return example -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -633,45 +653,43 @@ def main(): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - if (args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + requires_safety_checker=False, + ) + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: pipeline.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warning( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) pipeline.set_progress_bar_config(disable=True) num_new_images = args.num_class_images - cur_class_images logger.info(f"Number of class images to sample: {num_new_images}.") sample_dataset = PromptDataset(args.class_prompt, num_new_images) - batch_sampler = (DistributedBatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False) if num_processes > 1 else BatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False)) + batch_sampler = ( + DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + if num_processes > 1 + else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + ) sample_dataloader = DataLoader( sample_dataset, batch_sampler=batch_sampler, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) for example in tqdm( - sample_dataloader, - desc="Generating class images", - disable=not is_main_process, ): + sample_dataloader, + desc="Generating class images", + disable=not is_main_process, + ): images = pipeline(example["prompt"]).images for i, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = ( - class_images_dir / - f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - ) + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) pipeline.to("cpu") del pipeline @@ -687,53 +705,50 @@ def main(): elif args.pretrained_model_name_or_path: try: tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, - "tokenizer")) + url_or_path_join(args.pretrained_model_name_or_path, "tokenizer") + ) except KeyError as e: if "XLMRobertaTokenizer" in str(e): from paddlenlp.transformers import XLMRobertaTokenizer tokenizer = XLMRobertaTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, - "tokenizer")) + url_or_path_join(args.pretrained_model_name_or_path, "tokenizer") + ) else: raise e # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = 
UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) # We only train the additional adapter LoRA layers freeze_params(vae.parameters()) freeze_params(text_encoder.parameters()) freeze_params(unet.parameters()) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warning( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # now we will add new LoRA weights to the attention layers # It's important to realize here how many attention weights will be added and of which sizes # The sizes of the attention layers consist only of two different variables: @@ -750,14 +765,12 @@ def main(): # Set correct lora layers unet_lora_attn_procs = {} for name, attn_processor in unet.attn_processors.items(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - unet.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] @@ -767,14 +780,13 @@ def main(): elif isinstance(attn_processor, AttnProcessor2_5): lora_attn_processor_class = LoRAAttnProcessor2_5 else: - raise ValueError( - f"Unknown attention processor type: {attn_processor.__class__.__name__}" - ) + raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}") unet_lora_attn_procs[name] = lora_attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) unet.set_attn_processor(unet_lora_attn_procs) unet_lora_layers = AttnProcsLayers(unet.attn_processors) @@ -790,10 +802,12 @@ def main(): text_lora_attn_procs[name] = LoRAAttnProcessor( hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) temp_pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, text_encoder=text_encoder) + args.pretrained_model_name_or_path, text_encoder=text_encoder + ) temp_pipeline._modify_text_encoder(text_lora_attn_procs) text_encoder = temp_pipeline.text_encoder del temp_pipeline @@ -802,8 +816,7 @@ def main(): train_dataset = DreamBoothDataset( instance_data_root=args.instance_data_dir, instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir - if args.with_prior_preservation else None, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, class_prompt=args.class_prompt, class_num=args.num_class_images, tokenizer=tokenizer, @@ -811,7 +824,8 @@ def main(): width=args.width, center_crop=args.center_crop, interpolation="bilinear", - random_flip=args.random_flip, ) + random_flip=args.random_flip, + ) def collate_fn(examples): input_ids = 
[example["instance_prompt_ids"] for example in examples] @@ -826,58 +840,55 @@ def collate_fn(examples): pixel_values = paddle.stack(pixel_values).astype("float32") input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) - params_to_optimize = (list(unet_lora_layers.parameters()) + - list(text_encoder_lora_layers.parameters()) - if args.train_text_encoder else - unet_lora_layers.parameters()) + params_to_optimize = ( + list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) # Optimizer creation optimizer = AdamW( learning_rate=lr_scheduler, @@ -886,8 +897,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -902,25 +913,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! 
- total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. - progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 vae.eval() @@ -941,52 +946,43 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where unet_ctx_manager = unet.no_sync() else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() if use_attention_mask: - attention_mask = ( - batch["input_ids"] != tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif 
noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") if args.with_prior_preservation: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. @@ -997,8 +993,7 @@ def collate_fn(examples): loss = F.mse_loss(model_pred, target, reduction="mean") # Compute prior loss - prior_loss = F.mse_loss( - model_pred_prior, target_prior, reduction="mean") + prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean") # Add the prior loss to the instance loss. loss = loss + args.prior_loss_weight * prior_loss @@ -1032,54 +1027,52 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") # We combine the text encoder and UNet LoRA parameters with a simple # custom logic. So, use `LoraLoaderMixin.save_lora_weights()`. LoraLoaderMixin.save_lora_weights( save_directory=save_path, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) logger.info(f"Saved lora weights to {save_path}") if global_step >= args.max_train_steps: break if is_main_process: - if (args.validation_prompt is not None and - epoch % args.validation_epochs == 0): + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}.") + f" {args.validation_prompt}." 
+ ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unwrap_model(unet), text_encoder=unwrap_model(text_encoder), safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) pipeline.set_progress_bar_config(disable=True) # run inference - generator = (paddle.Generator().manual_seed(args.seed) - if args.seed else None) + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ pipeline( args.validation_prompt, num_inference_steps=25, - generator=generator, ).images[0] + generator=generator, + ).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) if args.report_to == "tensorboard": - writer.add_images( - "test", np_images, epoch, dataformats="NHWC") + writer.add_images("test", np_images, epoch, dataformats="NHWC") else: - writer.add_image( - "test", np_images, epoch, dataformats="NHWC") + writer.add_image("test", np_images, epoch, dataformats="NHWC") del pipeline if args.train_text_encoder: @@ -1092,28 +1085,25 @@ def collate_fn(examples): LoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) # Final inference # Load previous pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) # load attention processors pipeline.load_lora_weights(args.output_dir) # run inference if args.validation_prompt and args.num_validation_images > 0: - generator = paddle.Generator().manual_seed( - args.seed) if args.seed else None + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ - pipeline( - args.validation_prompt, - num_inference_steps=25, - generator=generator).images[0] + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) @@ -1128,8 +1118,7 @@ def collate_fn(examples): # logic to push to HF Hub if args.push_to_hub: if args.hub_model_id is None: - repo_id = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_id = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_id = args.hub_model_id @@ -1142,14 +1131,16 @@ def collate_fn(examples): }, base_wait_time=1.0, max_retries=5, - max_wait_time=10.0, ) + max_wait_time=10.0, + ) save_model_card( repo_id, images=images, base_model=args.pretrained_model_name_or_path, prompt=args.instance_prompt, - repo_folder=args.output_dir, ) + repo_folder=args.output_dir, + ) # Upload model logger.info(f"Pushing to {repo_id}") _retry( @@ -1164,7 +1155,8 @@ def collate_fn(examples): }, base_wait_time=1.0, max_retries=5, - max_wait_time=20.0, ) + max_wait_time=20.0, + ) if __name__ == "__main__": diff --git a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py 
b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py index 8da60623e57c1..fb7a20763c805 100644 --- a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py +++ b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py @@ -19,13 +19,9 @@ image = load_image(url) text = "a red car in the sun" -pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - "shi-labs/versatile-diffusion") +pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() text_to_image_strength = 0.75 -image = pipe( - prompt=text, image=image, - text_to_image_strength=text_to_image_strength).images[0] -image.save( - "dual_text_and_image_guided_generation-versatile_diffusion-result.png") +image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0] +image.save("dual_text_and_image_guided_generation-versatile_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py index 59d805fed60f1..99812e2bd2122 100644 --- a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py +++ b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py @@ -29,7 +29,5 @@ # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - image=init_image, mask_image=mask_image, - example_image=example_image).images[0] + image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0] image.save("image_guided_image_inpainting-paint_by_example-result.png") diff --git a/ppdiffusers/examples/inference/image_inpainting-repaint.py b/ppdiffusers/examples/inference/image_inpainting-repaint.py index 4e3cf9d1270c2..3d4a971fd734b 100644 --- a/ppdiffusers/examples/inference/image_inpainting-repaint.py +++ b/ppdiffusers/examples/inference/image_inpainting-repaint.py @@ -15,19 +15,15 @@ from ppdiffusers import RePaintPipeline, RePaintScheduler from ppdiffusers.utils import load_image -img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" -) +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png" # Load the original image and the mask as PIL images original_image = load_image(img_url).resize((256, 256)) mask_image = load_image(mask_url).resize((256, 256)) -scheduler = RePaintScheduler.from_pretrained( - "google/ddpm-ema-celebahq-256", subfolder="scheduler") -pipe = RePaintPipeline.from_pretrained( - "google/ddpm-ema-celebahq-256", scheduler=scheduler) +scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler") +pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler) output = pipe( image=original_image, @@ -35,7 +31,8 @@ num_inference_steps=250, eta=0.0, jump_length=10, - jump_n_sample=10, ) + jump_n_sample=10, +) inpainted_image = output.images[0] inpainted_image.save("image_inpainting-repaint-result.png") diff --git a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py index 4889b99839ad0..ea5294247238b 100644 --- a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py +++ 
b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py @@ -28,17 +28,16 @@ def download_image(url): # Loading additional models -feature_extractor = CLIPFeatureExtractor.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K") -clip_model = CLIPModel.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16) +feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K") +clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16) mixing_pipeline = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="clip_guided_images_mixing_stable_diffusion", clip_model=clip_model, feature_extractor=feature_extractor, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) mixing_pipeline.enable_attention_slicing() # Pipline running @@ -64,6 +63,7 @@ def download_image(url): guidance_scale=9.0, batch_size=1, clip_guidance_scale=100, - generator=generator, ).images + generator=generator, +).images pipe_images[0].save("clip_guided_images_mixing_stable_diffusion.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py index 537dc6cf71437..1525fc680c2c2 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py @@ -26,8 +26,6 @@ prompt = "奇幻的景观,以一种艺术的形式。" # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, strength=0.75, - guidance_scale=7.5).images[0] + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] image.save("image_to_image_text_guided_generation-alt_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py index d1cf291ca57f0..b1d9267b2ac0d 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py @@ -19,9 +19,8 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet +) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py index 5f106c65341f3..bdd71eb35c00d 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py @@ -24,7 +24,8 @@ pipe = IFImg2ImgPipeline.from_pretrained( "DeepFloyd/IF-I-XL-v1.0", variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) pipe.enable_xformers_memory_efficient_attention() prompt = "A fantasy landscape in style minecraft" prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) @@ -33,25 +34,26 @@ image=original_image, prompt_embeds=prompt_embeds, 
negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images pipe.to(paddle_device="cpu") # save intermediate image pil_image = pd_to_pil(image) -pil_image[0].save( - "./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png") +pil_image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png") super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) super_res_1_pipe.enable_xformers_memory_efficient_attention() image = super_res_1_pipe( image=image, original_image=original_image, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, ).images -image[0].save( - "./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png") + negative_prompt_embeds=negative_embeds, +).images +image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py index 8de116547d619..5b2d857d58b4a 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py @@ -18,8 +18,7 @@ from ppdiffusers.utils import load_image # 加载pipeline -pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") # 下载初始图片 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" @@ -29,8 +28,6 @@ prompt = "A fantasy landscape, trending on artstation" # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, strength=0.75, - guidance_scale=7.5).images[0] + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] image.save("image_to_image_text_guided_generation-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py index 6103f2a54a722..67472607587b3 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py @@ -17,8 +17,7 @@ from ppdiffusers import StableDiffusionImg2ImgPipeline from ppdiffusers.utils import load_image -pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2") +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2") # 下载初始图片 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" @@ -28,9 +27,6 @@ prompt = "A fantasy landscape, trending on artstation" # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, strength=0.75, - guidance_scale=7.5).images[0] + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] -image.save( - "image_to_image_text_guided_generation-stable_diffusion_2-result.png") +image.save("image_to_image_text_guided_generation-stable_diffusion_2-result.png") diff --git 
a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py index 1f7dc26f085bc..1c7678b55930c 100644 --- a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py +++ b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py @@ -16,8 +16,7 @@ from ppdiffusers.utils import load_image pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -image = load_image( - "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") +image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") result = pipe(mode="i2t", image=image, prompt=None) text = result.texts[0] with open("image_to_text_generation-unidiffuser-result.txt", "w") as f: diff --git a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py index 3d03fdb457501..a8478035c8c87 100644 --- a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py +++ b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py @@ -21,19 +21,21 @@ "lambdalabs/sd-image-variations-diffusers", revision="v2.0", from_diffusers=True, - from_hf_hub=True, ) + from_hf_hub=True, +) -im = load_image( - "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") +im = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") -tform = transforms.Compose([ - transforms.ToTensor(), - transforms.Resize( - (224, 224), - interpolation="bicubic", ), - transforms.Normalize([0.48145466, 0.4578275, 0.40821073], - [0.26862954, 0.26130258, 0.27577711]), -]) +tform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Resize( + (224, 224), + interpolation="bicubic", + ), + transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]), + ] +) inp = tform(im) out = sd_pipe(im, guidance_scale=3) diff --git a/ppdiffusers/examples/inference/image_variation-unidiffuser.py b/ppdiffusers/examples/inference/image_variation-unidiffuser.py index d2bd06a9c5ec0..c334c673ff288 100644 --- a/ppdiffusers/examples/inference/image_variation-unidiffuser.py +++ b/ppdiffusers/examples/inference/image_variation-unidiffuser.py @@ -16,8 +16,7 @@ from ppdiffusers.utils import load_image pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -image = load_image( - "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") +image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") result = pipe(mode="i2t2i", image=image, prompt=None) image = result.images[0] image.save("image_variation-unidiffuser-result.png") diff --git a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py index 08c7fbfb6c409..3b2ec2596cbcb 100644 --- a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py +++ b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py @@ -18,8 +18,7 @@ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" image = load_image(url) -pipe = VersatileDiffusionImageVariationPipeline.from_pretrained( - "shi-labs/versatile-diffusion") +pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") image = pipe(image).images[0] image.save("image_variation-versatile_diffusion-result.png") diff --git 
a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py index 79f9528c1a741..a986de034bc05 100644 --- a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py +++ b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py @@ -18,8 +18,7 @@ from ppdiffusers.utils import load_image # 加载pipeline -pipe = LDMSuperResolutionPipeline.from_pretrained( - "CompVis/ldm-super-resolution-4x-openimages") +pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages") # 下载初始图片 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" diff --git a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py index b5d317b5abcce..b6b29f140e86d 100644 --- a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py +++ b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py @@ -16,8 +16,7 @@ from ppdiffusers import SemanticStableDiffusionPipeline -pipe = SemanticStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "a photo of a cat" edit = { @@ -38,6 +37,7 @@ guidance_scale=guidance_scale, num_inference_steps=50, width=512, - height=512, ) + height=512, +) image = output.images[0] image.save("text_guided_generation-semantic_stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py index 013eeec9b316f..26115f88d6506 100644 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py +++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py @@ -14,8 +14,7 @@ import paddle -from ppdiffusers import (IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline) +from ppdiffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline from ppdiffusers.utils import load_image, pd_to_pil url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" @@ -24,8 +23,7 @@ url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" mask_image = load_image(url) -pipe = IFInpaintingPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) +pipe = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) pipe.enable_xformers_memory_efficient_attention() prompt = "blue sunglasses" prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) @@ -35,7 +33,8 @@ mask_image=mask_image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images pipe.to(paddle_device="cpu") # save intermediate image pil_image = pd_to_pil(image) @@ -45,7 +44,8 @@ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) super_res_1_pipe.enable_xformers_memory_efficient_attention() image = super_res_1_pipe( @@ -53,5 +53,6 @@ mask_image=mask_image, original_image=original_image, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, ).images + 
negative_prompt_embeds=negative_embeds, +).images image[0].save("./text_guided_image_inpainting-deepfloyd_if-if_stage_II.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py index dd2dde2fe504c..0fdfe1946a84f 100644 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py +++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py @@ -23,13 +23,10 @@ init_image = load_image(img_url).resize((512, 512)) mask_image = load_image(mask_url).resize((512, 512)) -pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") prompt = "a cat sitting on a bench" with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, mask_image=mask_image, - strength=0.75).images[0] + image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images[0] image.save("text_guided_image_inpainting-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py index c89ecf9f8de59..6b27f9a60cf88 100644 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py +++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py @@ -21,8 +21,7 @@ init_image = load_image(img_url).resize((512, 512)) mask_image = load_image(mask_url).resize((512, 512)) -pipe = StableDiffusionInpaintPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-inpainting") +pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting") prompt = "Face of a yellow cat, high resolution, sitting on a park bench" # image and mask_image should be PIL images. 
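The hunk above only shows the SD2 inpainting pipeline being constructed; as a minimal sketch of how such a pipeline is typically driven (mirroring the call pattern of the legacy inpainting script earlier in this diff, with placeholder URLs and an illustrative output filename rather than the values from the real example):

import paddle
from ppdiffusers import StableDiffusionInpaintPipeline
from ppdiffusers.utils import load_image

# Placeholder URLs -- the real script defines its own img_url / mask_url.
img_url = "https://example.com/input.png"
mask_url = "https://example.com/mask.png"

# Both inputs must be PIL images of the same size (resized to 512x512, as in the script above).
init_image = load_image(img_url).resize((512, 512))
mask_image = load_image(mask_url).resize((512, 512))

pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

# The white (masked) region is repainted according to the prompt; fp16 autocast is optional but faster.
with paddle.amp.auto_cast(True):
    image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
image.save("text_guided_image_inpainting-stable_diffusion_2-result.png")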
diff --git a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py index 736b2a2d09f37..de2298e710d3c 100644 --- a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py +++ b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py @@ -15,8 +15,7 @@ from ppdiffusers import StableDiffusionUpscalePipeline from ppdiffusers.utils import load_image -pipe = StableDiffusionUpscalePipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler") +pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler") url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png" low_res_img = load_image(url).resize((128, 128)) diff --git a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py index 511f0f55ac93b..2b4c1b1330a97 100644 --- a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py +++ b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py @@ -18,8 +18,7 @@ from ppdiffusers import AudioLDMPipeline -pipe = AudioLDMPipeline.from_pretrained( - "cvssp/audioldm", paddle_dtype=paddle.float16) +pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16) prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] diff --git a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py index f6863fe8f4f8c..fccaff284995e 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py @@ -14,10 +14,8 @@ from ppdiffusers import AltDiffusionPipeline, DPMSolverMultistepScheduler -scheduler = DPMSolverMultistepScheduler.from_pretrained( - "BAAI/AltDiffusion", subfolder="scheduler") -pipe = AltDiffusionPipeline.from_pretrained( - "BAAI/AltDiffusion", scheduler=scheduler) +scheduler = DPMSolverMultistepScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") +pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler) prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图" # or in English: diff --git a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py index a016bbfbe1019..9b420b5aa57ba 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py @@ -21,15 +21,13 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) resolution = 512 image = np.array( - load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - )) + load_image("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png") +) image = cv2.Canny(image, 100, 200) image = image[:, :, None] image = np.concatenate([image, image, image], axis=2) @@ -43,5 +41,6 @@ 
num_inference_steps=50, height=resolution, width=resolution, - controlnet_conditioning_scale=1.0, ).images[0] + controlnet_conditioning_scale=1.0, +).images[0] image.save("text_to_image_generation-controlnet-result-bird_canny.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py index b060557c4a7cb..f55ded139341f 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py @@ -14,20 +14,19 @@ import paddle -from ppdiffusers import (DiffusionPipeline, IFPipeline, - IFSuperResolutionPipeline) +from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline from ppdiffusers.utils import pd_to_pil # Stage 1: generate images -pipe = IFPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) +pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) pipe.enable_xformers_memory_efficient_attention() prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) image = pipe( prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images # save intermediate image pil_image = pd_to_pil(image) @@ -40,27 +39,30 @@ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) super_res_1_pipe.enable_xformers_memory_efficient_attention() image = super_res_1_pipe( image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images # save intermediate image pil_image = pd_to_pil(image) -pil_image[0].save( - "text_to_image_generation-deepfloyd_if-result-if_stage_II.png") +pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png") # save gpu memory super_res_1_pipe.to(paddle_device="cpu") # Stage 3: super resolution stage2 super_res_2_pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16) + "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16 +) super_res_2_pipe.enable_xformers_memory_efficient_attention() image = super_res_2_pipe( prompt=prompt, - image=image, ).images + image=image, +).images image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_III.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py index 89ebf5ee3570d..4a71ac1a6b273 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py @@ -15,10 +15,8 @@ from ppdiffusers import StableDiffusionPipelineSafe from ppdiffusers.pipelines.stable_diffusion_safe import SafetyConfig -pipe = StableDiffusionPipelineSafe.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = StableDiffusionPipelineSafe.from_pretrained("runwayml/stable-diffusion-v1-5") print(pipe.safety_concept) prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. 
leyendecker" out = pipe(prompt=prompt, **SafetyConfig.MAX) -out.images[0].save( - "text_to_image_generation-stable_diffusion_safe-result.png.png") +out.images[0].save("text_to_image_generation-stable_diffusion_safe-result.png.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py index 0c00344a7f602..0d0ef4e6ce819 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py @@ -16,21 +16,20 @@ from ppdiffusers import StableDiffusionAdapterPipeline, T2IAdapter from ppdiffusers.utils import PIL_INTERPOLATION, load_image -input_image = load_image( - "https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png" -) +input_image = load_image("https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png") color_palette = input_image.resize((8, 8)) -color_palette = color_palette.resize( - (512, 512), resample=PIL_INTERPOLATION["nearest"]) +color_palette = color_palette.resize((512, 512), resample=PIL_INTERPOLATION["nearest"]) adapter = T2IAdapter.from_pretrained("westfish/sd-v1-4-adapter-color") pipe = StableDiffusionAdapterPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", adapter=adapter, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) image = pipe( prompt="At night, glowing cubes in front of the beach", - image=color_palette, ).images[0] + image=color_palette, +).images[0] image.save("text_to_image_generation-t2i-adapter-result-color_adapter.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py index db8d5261d101a..d777a8ce31db3 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py @@ -14,8 +14,7 @@ from ppdiffusers import VersatileDiffusionTextToImagePipeline -pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion") +pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() image = pipe("an astronaut riding on a horse on mars").images[0] diff --git a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py index cb4171be41abc..fd93408658d48 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py +++ b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py @@ -19,25 +19,30 @@ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - num_train_timesteps=1000, ) + num_train_timesteps=1000, +) pipeline = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", scheduler=scheduler, - custom_pipeline="mixture_tiling.py", ) + custom_pipeline="mixture_tiling.py", +) pipeline # Mixture of Diffusers generation image = pipeline( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "An 
old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ]], + prompt=[ + [ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + ] + ], tile_height=640, tile_width=640, tile_row_overlap=0, tile_col_overlap=256, guidance_scale=8, seed=7178915308, - num_inference_steps=50, )["images"][0] + num_inference_steps=50, +)["images"][0] image.save("mixture_tiling" + ".png") diff --git a/ppdiffusers/examples/inference/text_to_video_generation-synth.py b/ppdiffusers/examples/inference/text_to_video_generation-synth.py index 9fd346c0f5bc1..e197cb41f426d 100644 --- a/ppdiffusers/examples/inference/text_to_video_generation-synth.py +++ b/ppdiffusers/examples/inference/text_to_video_generation-synth.py @@ -24,4 +24,5 @@ imageio.mimsave( "text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, - fps=8, ) + fps=8, +) diff --git a/ppdiffusers/examples/inference/text_to_video_generation-zero.py b/ppdiffusers/examples/inference/text_to_video_generation-zero.py index b26103c3f32e2..0e4efb3563d50 100644 --- a/ppdiffusers/examples/inference/text_to_video_generation-zero.py +++ b/ppdiffusers/examples/inference/text_to_video_generation-zero.py @@ -13,14 +13,14 @@ # limitations under the License. 
import imageio + # pip install imageio[ffmpeg] import paddle from ppdiffusers import TextToVideoZeroPipeline model_id = "runwayml/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16) +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) prompt = "A panda is playing guitar on times square" result = pipe(prompt=prompt).images diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py index 92557d8d6e2f4..e1914bab67daa 100644 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py +++ b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py @@ -18,8 +18,7 @@ from ppdiffusers import AudioDiffusionPipeline # 加载模型和scheduler -pipe = AudioDiffusionPipeline.from_pretrained( - "teticio/audio-diffusion-ddim-256") +pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(42) @@ -29,8 +28,7 @@ # 保存音频到本地 for i, audio in enumerate(audio): - write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, - audio.transpose()) + write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, audio.transpose()) # 保存图片 image.save("unconditional_audio_generation-audio_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py index 051f61f892230..9114555e75a38 100644 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py +++ b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py @@ -27,4 +27,5 @@ write( f"unconditional_audio_generation-dance_diffusion-result_{i}.wav", pipe.unet.sample_rate, - audio.transpose(), ) + audio.transpose(), + ) diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py index d498dfbd88225..fe99d89347981 100644 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py +++ b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py @@ -22,9 +22,9 @@ # Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid mid_file_path = ppdiffusers_url_download( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", - cache_dir=".", ) -pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion", paddle_dtype=paddle.float16) + cache_dir=".", +) +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16) processor = MidiProcessor() output = pipe(processor(mid_file_path)) audio = output.audios[0] diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py index 2e22c143e2271..90f93ac299ed4 100644 --- a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py +++ b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py @@ -16,8 +16,7 @@ scheduler = KarrasVeScheduler() # 
加载模型和scheduler -pipe = KarrasVePipeline.from_pretrained( - "google/ncsnpp-celebahq-256", scheduler=scheduler) +pipe = KarrasVePipeline.from_pretrained("google/ncsnpp-celebahq-256", scheduler=scheduler) # 执行pipeline进行推理 image = pipe().images diff --git a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py index fec274338d9ad..38aed057ce167 100644 --- a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py +++ b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py @@ -19,6 +19,5 @@ image = result.images[0] image.save("unconditional_image_text_generation-unidiffuser-result.png") text = result.texts[0] -with open("unconditional_image_text_generation-unidiffuser-result.txt", - "w") as f: +with open("unconditional_image_text_generation-unidiffuser-result.txt", "w") as f: print("{}\n".format(text), file=f) diff --git a/ppdiffusers/examples/stable_diffusion/generate_images.py b/ppdiffusers/examples/stable_diffusion/generate_images.py index e20424e75e4ee..933fd0b771040 100644 --- a/ppdiffusers/examples/stable_diffusion/generate_images.py +++ b/ppdiffusers/examples/stable_diffusion/generate_images.py @@ -22,9 +22,14 @@ from paddlenlp.transformers import CLIPTextModel from tqdm.auto import tqdm -from ppdiffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import DOWNLOAD_SERVER, PPDIFFUSERS_CACHE base_url = DOWNLOAD_SERVER + "/CompVis/data/" @@ -43,32 +48,30 @@ def batchify(data, batch_size=16): def generate_images( - unet_model_name_or_path, - text_encoder_model_name_or_path=None, - batch_size=16, - file="coco30k.csv", - save_path="output", - seed=42, - scheduler_type="ddim", - eta=0.0, - num_inference_steps=50, - guidance_scales=[3, 4, 5, 6, 7, 8], - height=256, - width=256, - device="gpu", - variant="bf16", ): + unet_model_name_or_path, + text_encoder_model_name_or_path=None, + batch_size=16, + file="coco30k.csv", + save_path="output", + seed=42, + scheduler_type="ddim", + eta=0.0, + num_inference_steps=50, + guidance_scales=[3, 4, 5, 6, 7, 8], + height=256, + width=256, + device="gpu", + variant="bf16", +): paddle.set_device(device) if variant == "fp32": variant = None - unet = UNet2DConditionModel.from_pretrained( - unet_model_name_or_path, variant=variant) + unet = UNet2DConditionModel.from_pretrained(unet_model_name_or_path, variant=variant) kwargs = {"safety_checker": None, "unet": unet} if text_encoder_model_name_or_path is not None: - text_encoder = CLIPTextModel.from_pretrained( - text_encoder_model_name_or_path, variant=variant) + text_encoder = CLIPTextModel.from_pretrained(text_encoder_model_name_or_path, variant=variant) kwargs["text_encoder"] = text_encoder - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", **kwargs) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", **kwargs) pipe.set_progress_bar_config(disable=True) beta_start = pipe.scheduler.beta_start beta_end = pipe.scheduler.beta_end @@ -80,17 +83,14 @@ def generate_images( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - 
skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -99,7 +99,8 @@ def generate_images( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") pipe.scheduler = scheduler @@ -122,7 +123,8 @@ def generate_images( eta=eta, height=height, width=width, - num_inference_steps=num_inference_steps, )[0] + num_inference_steps=num_inference_steps, + )[0] for image in images: path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) image.save(path) @@ -136,28 +138,33 @@ def generate_images( default=None, type=str, required=True, - help="unet_model_name_or_path.", ) + help="unet_model_name_or_path.", + ) parser.add_argument( "--text_encoder_model_name_or_path", default=None, type=str, - help="text_encoder_model_name_or_path.", ) + help="text_encoder_model_name_or_path.", + ) parser.add_argument( "--file", default="coco30k", type=str, - help="eval file.", ) + help="eval file.", + ) parser.add_argument( "--variant", default="fp32", type=str, choices=["fp32", "bf16"], - help="eval file.", ) + help="eval file.", + ) parser.add_argument( "--seed", default=42, type=int, - help="random seed.", ) + help="random seed.", + ) parser.add_argument( "--scheduler_type", default="ddim", @@ -167,22 +174,15 @@ def generate_images( ) parser.add_argument("--device", default="gpu", type=str, help="device") parser.add_argument("--batch_size", default=16, type=int, help="batch_size") - parser.add_argument( - "--num_inference_steps", - default=50, - type=int, - help="num_inference_steps") - parser.add_argument( - "--save_path", - default="outputs", - type=str, - help="Path to the output file.") + parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps") + parser.add_argument("--save_path", default="outputs", type=str, help="Path to the output file.") parser.add_argument( "--guidance_scales", default=[1.5, 2, 3, 4, 5, 6, 7, 8], nargs="+", type=str, - help="guidance_scales list.", ) + help="guidance_scales list.", + ) parser.add_argument("--height", default=256, type=int, help="height.") parser.add_argument("--width", default=256, type=int, help="width.") args = parser.parse_args() @@ -210,4 +210,5 @@ def generate_images( height=args.height, width=args.width, device=args.device, - variant=args.variant, ) + variant=args.variant, + ) diff --git a/ppdiffusers/examples/stable_diffusion/sd/model.py b/ppdiffusers/examples/stable_diffusion/sd/model.py index 449a74df28ff4..bd0df892a83b1 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/model.py +++ b/ppdiffusers/examples/stable_diffusion/sd/model.py @@ -21,8 +21,13 @@ from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - UNet2DConditionModel, is_ppxformers_available) +from ppdiffusers import ( + 
AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.initializer import reset_initialized_parameter, zeros_ from ppdiffusers.models.attention import AttentionBlock from ppdiffusers.models.ema import LitEma @@ -37,30 +42,31 @@ def __init__(self, model_args): self.model_args = model_args tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.tokenizer_name is not None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.tokenizer_name is not None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) vae_name_or_path = ( model_args.vae_name_or_path - if model_args.vae_name_or_path is not None else - os.path.join(model_args.pretrained_model_name_or_path, "vae")) + if model_args.vae_name_or_path is not None + else os.path.join(model_args.pretrained_model_name_or_path, "vae") + ) text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.text_encoder_name_or_path is not None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) + if model_args.text_encoder_name_or_path is not None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) unet_name_or_path = ( model_args.unet_name_or_path - if model_args.unet_name_or_path is not None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.unet_name_or_path is not None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) # init model and tokenizer tokenizer_kwargs = {} if model_args.model_max_length is not None: tokenizer_kwargs["model_max_length"] = model_args.model_max_length - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, - **tokenizer_kwargs) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) try: self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) except Exception: @@ -88,9 +94,9 @@ def __init__(self, model_args): beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, - prediction_type=self.model_args.prediction_type, ) - self.register_buffer("alphas_cumprod", - self.noise_scheduler.alphas_cumprod) + prediction_type=self.model_args.prediction_type, + ) + self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, @@ -99,7 +105,8 @@ def __init__(self, model_args): clip_sample=False, set_alpha_to_one=False, steps_offset=1, - prediction_type=self.model_args.prediction_type, ) + prediction_type=self.model_args.prediction_type, + ) self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps) self.use_ema = False self.model_ema = None @@ -109,7 +116,7 @@ def compute_snr(self, timesteps): Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 """ sqrt_alphas_cumprod = self.alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod)**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5 # Expand the tensors. 
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 @@ -118,15 +125,13 @@ def compute_snr(self, timesteps): sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[ - timesteps].cast("float32") + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32") while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., - None] + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) # Compute SNR. - snr = (alpha / sigma)**2 + snr = (alpha / sigma) ** 2 return snr def forward(self, input_ids=None, pixel_values=None, **kwargs): @@ -143,14 +148,14 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): if self.model_args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += self.model_args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype + ) if self.model_args.input_perturbation: - new_noise = noise + self.model_args.input_perturbation * paddle.randn( - noise.shape, dtype=noise.dtype) + new_noise = noise + self.model_args.input_perturbation * paddle.randn(noise.shape, dtype=noise.dtype) - timesteps = paddle.randint( - 0, self.noise_scheduler.config.num_train_timesteps, - (latents.shape[0], )).cast("int64") + timesteps = paddle.randint(0, self.noise_scheduler.config.num_train_timesteps, (latents.shape[0],)).cast( + "int64" + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if self.model_args.input_perturbation: @@ -165,7 +170,8 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): model_pred = self.unet( sample=noisy_latents, timestep=timesteps, - encoder_hidden_states=encoder_hidden_states, ).sample + encoder_hidden_states=encoder_hidden_states, + ).sample # Get the target for loss depending on the prediction type if self.model_args.prediction_type == "epsilon": @@ -173,64 +179,58 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): elif self.model_args.prediction_type == "v_prediction": target = self.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {self.model_args.prediction_type}") + raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}") # compute loss if self.model_args.snr_gamma is None: - loss = (F.mse_loss( - model_pred.cast("float32"), - target.cast("float32"), - reduction="none").mean([1, 2, 3]).mean()) + loss = ( + F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean() + ) else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. 
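# i.e. each sample's loss weight is min(snr, snr_gamma) / snr, where snr = (alpha_t / sigma_t) ** 2
# as returned by compute_snr above, so timesteps whose SNR exceeds snr_gamma are down-weighted.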
snr = self.compute_snr(timesteps) - mse_loss_weights = (paddle.stack( - [snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)], - axis=1, ).min(axis=1)[0] / snr) + mse_loss_weights = ( + paddle.stack([snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)], axis=1,).min( + axis=1 + )[0] + / snr + ) # We first calculate the original loss. Then we mean over the non-batch dimensions and # rebalance the sample-wise losses with their respective loss weights. # Finally, we take the mean of the rebalanced loss. - loss = F.mse_loss( - model_pred.cast("float32"), - target.cast("float32"), - reduction="none") + loss = F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none") loss = loss.mean(list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() return loss def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -285,20 +285,19 @@ def decode_image(self, pixel_values=None, max_batch=8, **kwargs): @paddle.no_grad() def log_image( - self, - input_ids=None, - height=256, - width=256, - eta=0.0, - guidance_scale=7.5, - max_batch=8, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=0.0, + guidance_scale=7.5, + max_batch=8, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log max_batch image if input_ids.shape[0] > max_batch: input_ids = input_ids[:max_batch] @@ -311,34 +310,25 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, - height // 8, width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0 @@ -347,8 +337,7 @@ def log_image( def set_recompute(self, use_recompute=False): if use_recompute: self.unet.enable_gradient_checkpointing() - if self.model_args.train_text_encoder and hasattr( - self.text_encoder, "gradient_checkpointing_enable"): + if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"): self.text_encoder.gradient_checkpointing_enable() def gradient_checkpointing_enable(self): @@ -362,26 +351,21 @@ def set_xformers(self, use_xformers=False): ) else: try: - attention_op = os.getenv("FLAG_XFORMERS_ATTENTION_OP", - "none").lower() + attention_op = os.getenv("FLAG_XFORMERS_ATTENTION_OP", "none").lower() if attention_op == "none": attention_op = None - self.unet.enable_xformers_memory_efficient_attention( - attention_op) - if hasattr(self.vae, - "enable_xformers_memory_efficient_attention"): - self.vae.enable_xformers_memory_efficient_attention( - attention_op) - if hasattr(self.text_encoder, - "enable_xformers_memory_efficient_attention"): - self.text_encoder.enable_xformers_memory_efficient_attention( - attention_op) + self.unet.enable_xformers_memory_efficient_attention(attention_op) + if hasattr(self.vae, "enable_xformers_memory_efficient_attention"): + 
self.vae.enable_xformers_memory_efficient_attention(attention_op) + if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"): + self.text_encoder.enable_xformers_memory_efficient_attention(attention_op) except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) def set_ema(self, use_ema=False): self.use_ema = use_ema diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py index d15e6b0894fe4..4ca34e749fc3f 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py +++ b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py @@ -34,39 +34,34 @@ @dataclass class SDTrainingArguments(TrainingArguments): - image_logging_steps: int = field( - default=1000, metadata={"help": "Log image every X steps."}) - to_static: bool = field( - default=False, metadata={"help": "Whether or not to_static"}) + image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."}) + to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"}) benchmark: bool = field( default=False, - metadata={"help": "Whether or not run benchmark."}, ) + metadata={"help": "Whether or not run benchmark."}, + ) profiler_options: Optional[str] = field( default=None, - metadata={"help": "profiler_options."}, ) + metadata={"help": "profiler_options."}, + ) report_to: Optional[List[str]] = field( default_factory=lambda: ["custom_visualdl"], - metadata={ - "help": - "The list of integrations to report the results and logs to." - }, ) + metadata={"help": "The list of integrations to report the results and logs to."}, + ) resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) only_save_updated_model: bool = field( - default=True, - metadata={"help": "Whether or not save only_save_updated_model"}) - unet_learning_rate: float = field( - default=None, - metadata={"help": "The initial learning rate for Unet Model."}) + default=True, metadata={"help": "Whether or not save only_save_updated_model"} + ) + unet_learning_rate: float = field(default=None, metadata={"help": "The initial learning rate for Unet Model."}) text_encoder_learning_rate: float = field( default=None, metadata={"help": "The initial learning rate for Text Encoder Model."}, @@ -75,19 +70,17 @@ class SDTrainingArguments(TrainingArguments): def __post_init__(self): super().__post_init__() self.image_logging_steps = ( - (math.ceil(self.image_logging_steps / self.logging_steps) * - self.logging_steps) if self.image_logging_steps > 0 else -1) - self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", - "False")) or self.use_ema + (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps) + if self.image_logging_steps > 0 + else -1 + ) + self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema self.enable_xformers_memory_efficient_attention = ( - str2bool(os.getenv("FLAG_XFORMERS", "False")) or - self.enable_xformers_memory_efficient_attention) - self.recompute = (str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or - self.recompute) - self.benchmark = (str2bool(os.getenv("FLAG_BENCHMARK", "False")) or - self.benchmark) - self.to_static = (str2bool(os.getenv("FLAG_TO_STATIC", "False")) or - self.to_static) + str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention + ) + self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute + self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark + self.to_static = str2bool(os.getenv("FLAG_TO_STATIC", "False")) or self.to_static if self.text_encoder_learning_rate is None: self.text_encoder_learning_rate = self.learning_rate @@ -105,45 +98,34 @@ def __post_init__(self): @dataclass class SDModelArguments: - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_name_or_path"}) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"}, + ) pretrained_model_name_or_path: str = field( default="CompVis/stable-diffusion-v1-4", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." 
- }, ) - model_max_length: int = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) prediction_type: str = field( default="epsilon", metadata={ - "help": - "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, ) - num_inference_steps: int = field( - default=50, metadata={"help": "num_inference_steps"}) - train_text_encoder: bool = field( - default=False, metadata={"help": "Whether or not train text encoder"}) - - noise_offset: float = field( - default=0, metadata={"help": "The scale of noise offset."}) + "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" + }, + ) + num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) + train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"}) + + noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."}) snr_gamma: Optional[float] = field( default=None, metadata={ - "help": - "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." - }, ) + "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." + }, + ) input_perturbation: Optional[float] = field( default=0, metadata={"help": "The scale of input perturbation. 
Recommended 0.1."}, @@ -158,14 +140,18 @@ class SDDataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) interpolation: str = field( default="lanczos", - metadata={"help": "interpolation method"}, ) + metadata={"help": "interpolation method"}, + ) diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py index 042f4f9410724..0ef65c15cac26 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py @@ -22,7 +22,11 @@ from paddle.io import DataLoader from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, TrainerCallback, VisualDLCallback, rewrite_logs) + INTEGRATION_TO_CALLBACK, + TrainerCallback, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.transformers.model_utils import _add_variant from paddlenlp.utils import profiler from paddlenlp.utils.log import logger @@ -58,19 +62,17 @@ def autocast_smart_context_manager(self, args): custom_black_list=set(custom_black_list), custom_white_list=set(custom_white_list), level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -78,26 +80,32 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): max_batch = 4 if args.resolution > 256 else 8 image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"], max_batch=max_batch) + pixel_values=inputs["pixel_values"], max_batch=max_batch + ) image_logs["ddim-samples-1.0"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=1.0, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) image_logs["ddim-samples-7.5"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=7.5, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) if not state.is_world_process_zero: return @@ -110,10 +118,8 @@ def on_log(self, args, 
state, control, logs=None, **kwargs): logs["unet_lr"] = base_learning_rate if model.train_text_encoder: if args.text_encoder_learning_rate != args.unet_learning_rate: - logs[ - "unet_lr"] = base_learning_rate * args.unet_learning_rate - logs["text_encoder_lr"] = (base_learning_rate * - args.text_encoder_learning_rate) + logs["unet_lr"] = base_learning_rate * args.unet_learning_rate + logs["text_encoder_lr"] = base_learning_rate * args.text_encoder_learning_rate else: logs["text_encoder_lr"] = base_learning_rate @@ -127,11 +133,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -172,8 +178,7 @@ def __init__(self, benchmark=True, profiler_options=None): self.profiler_options = profiler_options def on_train_begin(self, args, state, control, **kwargs): - assert (args.gradient_accumulation_steps == 1 and not args.do_eval and - not args.do_predict) + assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict if self.benchmark: self.reader_cost_avg = AverageStatistical() @@ -198,8 +203,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): if self.benchmark: if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + ( - time.time() - self.maybe_log_save_evaluate_start) + self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) ips = logs["interval_steps_per_second"] * args.train_batch_size avg_batch_cost = 1 / logs["interval_steps_per_second"] logger.info( @@ -211,14 +215,15 @@ def on_log(self, args, state, control, logs=None, **kwargs): self.reader_cost_avg.get_average(), avg_batch_cost, args.train_batch_size, - ips, )) + ips, + ) + ) self.reader_cost_avg.reset() def on_epoch_end(self, args, state, control, **kwargs): if self.benchmark: train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % - (state.epoch, train_epoch_cost)) + logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) # register visualdl_with_image @@ -232,7 +237,9 @@ def __init__(self, **kwargs): self.add_callback( BenchmarkCallback( benchmark=self.args.benchmark, - profiler_options=self.args.profiler_options, )) + profiler_options=self.args.profiler_options, + ) + ) if self.args.benchmark: if self.args.disable_tqdm: self.pop_callback(PrinterCallback) @@ -251,34 +258,27 @@ def get_train_dataloader(self): self.train_dataset, batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) else: return super().get_train_dataloader() - def _save(self, - output_dir=None, - state_dict=None, - merge_tensor_parallel=False): + def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) if self.args.only_save_updated_model: unwraped_model = unwrap_model(self.model) logger.info(f"Saving unet 
checkpoint to {output_dir}/unet") - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet")) + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet")) if unwraped_model.use_ema: logger.info(f"Saving ema unet checkpoint to {output_dir}/unet") with unwraped_model.ema_scope(): - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet"), variant="ema") + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema") if unwraped_model.train_text_encoder: - logger.info( - f"Saving text encoder checkpoint to {output_dir}/text_encoder" - ) - unwraped_model.text_encoder.save_pretrained( - os.path.join(output_dir, "text_encoder")) + logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder") + unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder")) else: logger.info(f"Saving model checkpoint to {output_dir}") if state_dict is None: @@ -287,10 +287,10 @@ def _save(self, state_dict, os.path.join( output_dir, - _add_variant(PADDLE_WEIGHTS_NAME, - self.args.weight_name_suffix), ), ) + _add_variant(PADDLE_WEIGHTS_NAME, self.args.weight_name_suffix), + ), + ) if self.args.should_save: if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) - paddle.save(self.args, - os.path.join(output_dir, TRAINING_ARGS_NAME)) + paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py index 82d71e6c5f816..b41f0b799469f 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py +++ b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py @@ -46,8 +46,7 @@ def parse_src(filename): elif data_source == "laion_aes": text_json = json.loads(vec[2]) img_b64 = vec[5] - caption = text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) else: _, captions, _, _, _, img_b64 = vec[:6] caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") @@ -77,23 +76,26 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + ): self.size = size if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), interpolation), + RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.text_processing = lambda caption: tokenizer( @@ -101,7 +103,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids[0] + return_tensors="pd", + ).input_ids[0] self.file_list = [] file_weights = [] with open(file_list, "r") as f: @@ -122,19 +125,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = 
np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with the same probability") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -143,9 +141,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -171,19 +167,14 @@ def sample_loader(self, file_ids, filenames): if w < self.size or h < self.size: continue yield { - "pixel_values": - self.image_processing(data["image"]), - "input_ids": - self.text_processing(data["caption"]), + "pixel_values": self.image_processing(data["image"]), + "input_ids": self.text_processing(data["caption"]), } def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -192,8 +183,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py index a50d56e2b5b11..a7afb1ddf6c41 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py @@ -17,8 +17,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from diffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - UNet2DConditionModel) +from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, UNet2DConditionModel from transformers import AutoTokenizer, CLIPTextModel from transformers.utils.logging import get_logger @@ -35,9 +34,8 @@ def __init__(self, model, decay=0.9999, use_num_upates=True): self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) self.register_buffer( "num_updates", - torch.tensor( - 0, dtype=torch.int) if use_num_upates else torch.tensor( - -1, dtype=torch.int), ) + torch.tensor(0, dtype=torch.int) if use_num_upates else torch.tensor(-1, dtype=torch.int), + ) for name, p in model.named_parameters(): if p.requires_grad: @@ -53,8 +51,7 @@ def forward(self, model): if self.num_updates >= 0: self.num_updates += 1 - decay = min(self.decay, - (1 + self.num_updates) / (10 + 
self.num_updates)) + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) one_minus_decay = 1.0 - decay @@ -65,10 +62,8 @@ def forward(self, model): for key in m_param: if m_param[key].requires_grad: sname = self.m_name2s_name[key] - shadow_params[sname] = shadow_params[sname].type_as(m_param[ - key]) - shadow_params[sname].sub_( - one_minus_decay * (shadow_params[sname] - m_param[key])) + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) else: assert key not in self.m_name2s_name @@ -77,8 +72,7 @@ def copy_to(self, model): shadow_params = dict(self.named_buffers()) for key in m_param: if m_param[key].requires_grad: - m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]] - .data) + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) else: assert key not in self.m_name2s_name @@ -89,9 +83,7 @@ def store(self, parameters): parameters: Iterable of `torch.nn.Parameter`; the parameters to be temporarily stored. """ - self.collected_params = [ - param.detach().cpu().clone() for param in parameters - ] + self.collected_params = [param.detach().cpu().clone() for param in parameters] def restore(self, parameters): """ @@ -113,19 +105,26 @@ class StableDiffusionModel(nn.Module): def __init__(self, model_args): super().__init__() self.model_args = model_args - tokenizer_name_or_path = (model_args.tokenizer_name - if model_args.tokenizer_name is not None else - model_args.pretrained_model_name_or_path) - vae_name_or_path = (model_args.vae_name_or_path - if model_args.vae_name_or_path is not None else - model_args.pretrained_model_name_or_path) + tokenizer_name_or_path = ( + model_args.tokenizer_name + if model_args.tokenizer_name is not None + else model_args.pretrained_model_name_or_path + ) + vae_name_or_path = ( + model_args.vae_name_or_path + if model_args.vae_name_or_path is not None + else model_args.pretrained_model_name_or_path + ) text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.text_encoder_name_or_path is not None else - model_args.pretrained_model_name_or_path) - unet_name_or_path = (model_args.unet_name_or_path - if model_args.unet_name_or_path is not None else - model_args.pretrained_model_name_or_path) + if model_args.text_encoder_name_or_path is not None + else model_args.pretrained_model_name_or_path + ) + unet_name_or_path = ( + model_args.unet_name_or_path + if model_args.unet_name_or_path is not None + else model_args.pretrained_model_name_or_path + ) # init model and tokenizer tokenizer_kwargs = {} if model_args.model_max_length is not None: @@ -134,14 +133,12 @@ def __init__(self, model_args): tokenizer_name_or_path, **tokenizer_kwargs, subfolder="tokenizer", - use_fast=False, ) - self.vae = AutoencoderKL.from_pretrained( - vae_name_or_path, subfolder="vae") - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path, subfolder="text_encoder") + use_fast=False, + ) + self.vae = AutoencoderKL.from_pretrained(vae_name_or_path, subfolder="vae") + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path, subfolder="text_encoder") try: - self.unet = UNet2DConditionModel.from_pretrained( - unet_name_or_path, subfolder="unet") + self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path, subfolder="unet") except Exception: self.unet = UNet2DConditionModel.from_config(unet_name_or_path) logger.info("Init unet model from scratch!") @@ -166,9 +163,9 @@ def 
__init__(self, model_args): beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, - prediction_type=self.model_args.prediction_type, ) - self.register_buffer("alphas_cumprod", - self.noise_scheduler.alphas_cumprod) + prediction_type=self.model_args.prediction_type, + ) + self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, @@ -177,7 +174,8 @@ def __init__(self, model_args): clip_sample=False, set_alpha_to_one=False, steps_offset=1, - prediction_type=self.model_args.prediction_type, ) + prediction_type=self.model_args.prediction_type, + ) self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps) self.use_ema = False self.model_ema = None @@ -187,25 +185,22 @@ def compute_snr(self, timesteps): Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 """ sqrt_alphas_cumprod = self.alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod)**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5 # Expand the tensors. # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 - sqrt_alphas_cumprod = sqrt_alphas_cumprod.to( - device=timesteps.device)[timesteps].float() + sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float() while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to( - device=timesteps.device)[timesteps].float() + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float() while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., - None] + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) # Compute SNR. 
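For reference, the gather-and-broadcast in `compute_snr` above boils down to a one-line closed form. A minimal standalone sketch, assuming `alphas_cumprod` is the 1-D buffer registered in `__init__` and `timesteps` is a batch of integer timesteps (the helper name `snr_at` is illustrative, not part of this patch):

import torch

def snr_at(alphas_cumprod: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
    # SNR(t) = (alpha_t / sigma_t) ** 2 = alphas_cumprod[t] / (1 - alphas_cumprod[t])
    ac = alphas_cumprod[timesteps].float()
    return ac / (1.0 - ac)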
- snr = (alpha / sigma)**2 + snr = (alpha / sigma) ** 2 return snr def forward(self, input_ids=None, pixel_values=None, **kwargs): @@ -220,17 +215,18 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): noise += self.model_args.noise_offset * torch.randn( (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype, - device=noise.device, ) + device=noise.device, + ) if self.model_args.input_perturbation: - new_noise = noise + self.model_args.input_perturbation * torch.randn_like( - noise) + new_noise = noise + self.model_args.input_perturbation * torch.randn_like(noise) timesteps = torch.randint( 0, self.noise_scheduler.config.num_train_timesteps, - (latents.shape[0], ), + (latents.shape[0],), dtype=torch.long, - device=latents.device, ) + device=latents.device, + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if self.model_args.input_perturbation: @@ -239,15 +235,15 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): noisy_latents = self.add_noise(latents, noise, timesteps) # text encode - encoder_hidden_states = self.text_encoder( - input_ids, return_dict=False)[0] + encoder_hidden_states = self.text_encoder(input_ids, return_dict=False)[0] # unet model_pred = self.unet( sample=noisy_latents, timestep=timesteps, encoder_hidden_states=encoder_hidden_states, - return_dict=False, )[0] + return_dict=False, + )[0] # Get the target for loss depending on the prediction type if self.model_args.prediction_type == "epsilon": @@ -255,62 +251,53 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): elif self.model_args.prediction_type == "v_prediction": target = self.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {self.model_args.prediction_type}") + raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}") # compute loss if self.model_args.snr_gamma is None: - loss = (F.mse_loss( - model_pred.float(), target.float(), reduction="none") - .mean([1, 2, 3]).mean()) + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean() else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. snr = self.compute_snr(timesteps) - mse_loss_weights = (torch.stack( - [snr, self.model_args.snr_gamma * torch.ones_like(timesteps)], - dim=1).min(dim=1)[0] / snr) + mse_loss_weights = ( + torch.stack([snr, self.model_args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr + ) # We first calculate the original loss. Then we mean over the non-batch dimensions and # rebalance the sample-wise losses with their respective loss weights. # Finally, we take the mean of the rebalanced loss. 
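The comments above describe the min-SNR rebalancing from the cited paper: each sample's MSE is scaled by min(SNR(t), snr_gamma) / SNR(t). A minimal sketch of that weighting, assuming `snr` has already been computed per sample (the function name and the example default snr_gamma=5.0 are illustrative, not part of this patch):

import torch
import torch.nn.functional as F

def min_snr_weighted_mse(model_pred, target, snr, snr_gamma=5.0):
    # Per-sample weight: min(SNR(t), snr_gamma) / SNR(t).
    weights = torch.minimum(snr, torch.full_like(snr, snr_gamma)) / snr
    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
    # Mean over the non-batch dimensions, rebalance per sample, then average the batch.
    loss = loss.mean(dim=list(range(1, loss.ndim))) * weights
    return loss.mean()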
- loss = F.mse_loss( - model_pred.float(), target.float(), reduction="none") - loss = loss.mean( - dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() return loss def add_noise( - self, - original_samples: torch.Tensor, - noise: torch.Tensor, - timesteps: torch.Tensor, ) -> torch.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: torch.Tensor, - noise: torch.Tensor, - timesteps: torch.Tensor) -> torch.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -350,20 +337,19 @@ def decode_image(self, pixel_values=None, max_batch=8, **kwargs): @torch.no_grad() def log_image( - self, - input_ids=None, - height=256, - width=256, - eta=0.0, - guidance_scale=7.5, - max_batch=8, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=0.0, + guidance_scale=7.5, + max_batch=8, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log max_batch image if input_ids.shape[0] > max_batch: input_ids = input_ids[:max_batch] @@ -376,44 +362,40 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pt", ) + return_tensors="pt", + ) uncond_embeddings = self.text_encoder( uncond_input.input_ids.to(device=input_ids.device), - return_dict=False, )[0] - text_embeddings = torch.cat( - [uncond_embeddings, text_embeddings], dim=0) - - latents = torch.randn(( - input_ids.shape[0], - self.unet.config.in_channels, - height // 8, - width // 8, )).to(device=input_ids.device) + return_dict=False, + )[0] + text_embeddings = torch.cat([uncond_embeddings, text_embeddings], dim=0) + + latents = torch.randn( + ( + input_ids.shape[0], + self.unet.config.in_channels, + height // 8, + width // 8, + ) + ).to(device=input_ids.device) latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: - latent_model_input = (torch.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = torch.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=text_embeddings, - return_dict=False, )[0] + return_dict=False, + )[0] if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - latents = self.eval_scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1).permute(0, 2, 3, 1) * 255.0 @@ -422,8 +404,7 @@ def log_image( def set_recompute(self, use_recompute=False): if use_recompute: self.unet.enable_gradient_checkpointing() - if self.model_args.train_text_encoder and hasattr( - self.text_encoder, "gradient_checkpointing_enable"): + if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"): self.text_encoder.gradient_checkpointing_enable() def gradient_checkpointing_enable(self): @@ -433,17 +414,15 @@ def set_xformers(self, use_xformers=False): if use_xformers: try: self.unet.enable_xformers_memory_efficient_attention() - if hasattr(self.vae, - "enable_xformers_memory_efficient_attention"): + if hasattr(self.vae, "enable_xformers_memory_efficient_attention"): self.vae.enable_xformers_memory_efficient_attention() - if hasattr(self.text_encoder, - "enable_xformers_memory_efficient_attention"): - self.text_encoder.enable_xformers_memory_efficient_attention( - ) + if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"): + self.text_encoder.enable_xformers_memory_efficient_attention() except Exception as e: 
logger.warn( "Could not enable memory efficient attention. Make sure torch is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) def set_ema(self, use_ema=False): self.use_ema = use_ema diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py index 4efe98bed8a65..b49d994418a77 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py @@ -46,63 +46,58 @@ def str2bool(v): if not str2bool(os.getenv("FLAG_SDP", "True")): if hasattr(torch.nn.functional, "scaled_dot_product_attention"): - torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) + torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention del torch.nn.functional.scaled_dot_product_attention - print( - "Removed `torch.nn.functional.scaled_dot_product_attention`, we will use default attention implementation." - ) + print("Removed `torch.nn.functional.scaled_dot_product_attention`, we will use default attention implementation.") @dataclass class SDTrainingArguments(TrainingArguments): - image_logging_steps: int = field( - default=1000, metadata={"help": "Log image every X steps."}) + image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."}) recompute: bool = field( default=False, - metadata={"help": "Whether or not run recompute."}, ) + metadata={"help": "Whether or not run recompute."}, + ) benchmark: bool = field( default=False, - metadata={"help": "Whether or not run benchmark."}, ) + metadata={"help": "Whether or not run benchmark."}, + ) report_to: Optional[List[str]] = field( default_factory=lambda: ["custom_visualdl"], - metadata={ - "help": - "The list of integrations to report the results and logs to." - }, ) + metadata={"help": "The list of integrations to report the results and logs to."}, + ) resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) only_save_updated_model: bool = field( - default=True, - metadata={"help": "Whether or not save only_save_updated_model"}) + default=True, metadata={"help": "Whether or not save only_save_updated_model"} + ) log_level: str = field( default="info", - metadata={"help": "log_level."}, ) + metadata={"help": "log_level."}, + ) def __post_init__(self): super().__post_init__() self.image_logging_steps = ( - (math.ceil(self.image_logging_steps / self.logging_steps) * - self.logging_steps) if self.image_logging_steps > 0 else -1) - self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", - "False")) or self.use_ema + (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps) + if self.image_logging_steps > 0 + else -1 + ) + self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema self.enable_xformers_memory_efficient_attention = ( - str2bool(os.getenv("FLAG_XFORMERS", "False")) or - self.enable_xformers_memory_efficient_attention) - self.recompute = (str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or - self.recompute) + str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention + ) + self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute self.gradient_checkpointing = self.gradient_checkpointing or self.recompute - self.benchmark = (str2bool(os.getenv("FLAG_BENCHMARK", "False")) or - self.benchmark) + self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark def print_config(self, args=None, key=""): """ @@ -115,8 +110,7 @@ def print_config(self, args=None, key=""): logger.info("{:^40}".format("{} Configuration Arguments".format(key))) logger.info("{:30}: {}".format("torch version", torch.__version__)) - logger.info("{:30}: {}".format("torch commit id", - torch.version.git_version)) + logger.info("{:30}: {}".format("torch commit id", torch.version.git_version)) for a in dir(args): if a[:2] != "__": # don't print double underscore methods @@ -129,45 +123,34 @@ def print_config(self, args=None, key=""): @dataclass class SDModelArguments: - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_name_or_path"}) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"}, + ) pretrained_model_name_or_path: str = field( default="CompVis/stable-diffusion-v1-4", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." 
- }, ) - model_max_length: int = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) prediction_type: str = field( default="epsilon", metadata={ - "help": - "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, ) - num_inference_steps: int = field( - default=50, metadata={"help": "num_inference_steps"}) - train_text_encoder: bool = field( - default=False, metadata={"help": "Whether or not train text encoder"}) - - noise_offset: float = field( - default=0, metadata={"help": "The scale of noise offset."}) + "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" + }, + ) + num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) + train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"}) + + noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."}) snr_gamma: Optional[float] = field( default=None, metadata={ - "help": - "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." - }, ) + "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." + }, + ) input_perturbation: Optional[float] = field( default=0, metadata={"help": "The scale of input perturbation. 
Recommended 0.1."}, @@ -182,14 +165,18 @@ class SDDataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) interpolation: str = field( default="lanczos", - metadata={"help": "interpolation method"}, ) + metadata={"help": "interpolation method"}, + ) diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py index 6420971caadf8..5338a0c72d142 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py @@ -29,12 +29,13 @@ def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs, - **kwargs, ): + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs, + **kwargs, +): control.should_log = False return self.call_event("on_log", args, state, control, logs=logs, **kwargs) @@ -64,9 +65,7 @@ def __init__(self, vdl_writer=None): visualdl has_visualdl = False if not has_visualdl: - raise RuntimeError( - "VisualDLWithImageCallback requires visualdl to be installed. Please install visualdl." - ) + raise RuntimeError("VisualDLWithImageCallback requires visualdl to be installed. Please install visualdl.") if has_visualdl: try: from visualdl import LogWriter @@ -81,8 +80,7 @@ def __init__(self, vdl_writer=None): def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def _init_summary_writer(self, args, log_dir=None): @@ -108,34 +106,38 @@ def on_train_begin(self, args, state, control, **kwargs): self.vdl_writer.add_text("model_config", model_config_json) if hasattr(self.vdl_writer, "add_hparams"): - self.vdl_writer.add_hparams( - args.to_sanitized_dict(), metrics_list=[]) + self.vdl_writer.add_hparams(args.to_sanitized_dict(), metrics_list=[]) def on_log(self, args, state, control, logs=None, **kwargs): # log image on each node inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): max_batch = 4 if args.resolution > 256 else 8 image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"].to(args.device), - max_batch=max_batch) + pixel_values=inputs["pixel_values"].to(args.device), max_batch=max_batch + ) image_logs["ddim-samples-1.0"] = model.log_image( input_ids=inputs["input_ids"].to(args.device), guidance_scale=1.0, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) image_logs["ddim-samples-7.5"] = 
model.log_image( input_ids=inputs["input_ids"].to(args.device), guidance_scale=7.5, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) if not state.is_world_process_zero: return @@ -153,11 +155,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() def on_train_end(self, args, state, control, **kwargs): @@ -202,8 +204,7 @@ def __init__(self, benchmark=True, **kwargs): self.benchmark = benchmark def on_train_begin(self, args, state, control, **kwargs): - assert (args.gradient_accumulation_steps == 1 and not args.do_eval and - not args.do_predict) + assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict if self.benchmark: self.reader_cost_avg = AverageStatistical() @@ -225,8 +226,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): if self.benchmark: if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + ( - time.time() - self.maybe_log_save_evaluate_start) + self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) ips = logs["interval_steps_per_second"] * args.train_batch_size avg_batch_cost = 1 / logs["interval_steps_per_second"] logger.info( @@ -238,14 +238,15 @@ def on_log(self, args, state, control, logs=None, **kwargs): self.reader_cost_avg.get_average(), avg_batch_cost, args.train_batch_size, - ips, )) + ips, + ) + ) self.reader_cost_avg.reset() def on_epoch_end(self, args, state, control, **kwargs): if self.benchmark: train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % - (state.epoch, train_epoch_cost)) + logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) # register visualdl_with_image @@ -280,22 +281,22 @@ def get_train_dataloader(self): self.train_dataset, batch_size=self._train_batch_size, num_workers=self.args.dataloader_num_workers, - worker_init_fn=None - if self.args.world_size <= 1 else worker_init_fn, ) + worker_init_fn=None if self.args.world_size <= 1 else worker_init_fn, + ) else: return super().get_train_dataloader() def _inner_training_loop( - self, - batch_size=None, - args=None, - resume_from_checkpoint=None, - trial=None, - ignore_keys_for_eval=None, ): + self, + batch_size=None, + args=None, + resume_from_checkpoint=None, + trial=None, + ignore_keys_for_eval=None, + ): self.accelerator.free_memory() self._train_batch_size = batch_size - logger.debug( - f"Currently training with a batch size of: {self._train_batch_size}") + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -303,32 +304,27 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = (args.train_batch_size * - 
args.gradient_accumulation_steps * - args.world_size) + total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) - num_update_steps_per_epoch = (len_dataloader // - args.gradient_accumulation_steps) + num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) num_examples = self.num_examples(train_dataloader) if args.max_steps > 0: max_steps = args.max_steps num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( - args.max_steps % num_update_steps_per_epoch > 0) + args.max_steps % num_update_steps_per_epoch > 0 + ) # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's # the best we can do. num_train_samples = args.max_steps * total_train_batch_size else: - max_steps = math.ceil(args.num_train_epochs * - num_update_steps_per_epoch) + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(args.num_train_epochs) - num_train_samples = (self.num_examples(train_dataloader) * - args.num_train_epochs) - elif (args.max_steps > - 0): # Rely on max_steps when dataloader does not have a working size + num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size max_steps = args.max_steps # Setting a very large number of epochs so we go as many times as necessary over the iterator. num_train_epochs = sys.maxsize @@ -338,7 +334,8 @@ def _inner_training_loop( else: raise ValueError( "args.max_steps must be set to a positive value if dataloader does not have a length, was" - f" {args.max_steps}") + f" {args.max_steps}" + ) # Compute absolute values for logging, eval, and save if given as ratio if args.logging_steps and args.logging_steps < 1: @@ -354,18 +351,20 @@ def _inner_training_loop( # references registered here no longer work on other gpus, breaking the module raise ValueError( "Currently --debug underflow_overflow is not supported under DP. Please use DDP" - " (torch.distributed.launch).") + " (torch.distributed.launch)." + ) else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa delay_optimizer_creation = ( - self.sharded_ddp is not None and - self.sharded_ddp != ShardedDDPOption.SIMPLE or - is_sagemaker_mp_enabled() or self.fsdp is not None) + self.sharded_ddp is not None + and self.sharded_ddp != ShardedDDPOption.SIMPLE + or is_sagemaker_mp_enabled() + or self.fsdp is not None + ) if self.is_deepspeed_enabled: - self.optimizer, self.lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps) + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) @@ -396,12 +395,12 @@ def _inner_training_loop( if self.use_apex: model = self.accelerator.prepare(self.model) else: - model, self.optimizer = self.accelerator.prepare( - self.model, self.optimizer) + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) else: # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( - self.model, self.optimizer, self.lr_scheduler) + self.model, self.optimizer, self.lr_scheduler + ) if self.is_fsdp_enabled: self.model = model @@ -417,8 +416,7 @@ def _inner_training_loop( self._globalstep_last_start_time = time.time() # deepspeed ckpt loading if resume_from_checkpoint is not None and self.is_deepspeed_enabled: - deepspeed_load_checkpoint(self.model_wrapped, - resume_from_checkpoint) + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) @@ -431,19 +429,11 @@ def _inner_training_loop( logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples:,}") logger.info(f" Num Epochs = {num_train_epochs:,}") - logger.info( - f" Instantaneous batch size per device = {self._train_batch_size:,}" - ) - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}" - ) + logger.info(f" Instantaneous batch size per device = {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {max_steps:,}") - logger.info( - f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}" - ) + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -453,24 +443,19 @@ def _inner_training_loop( # Check if continuing training from a checkpoint if resume_from_checkpoint is not None and os.path.isfile( - os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)): - self.state = TrainerState.load_from_json( - os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) + ): + self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) epochs_trained = self.state.global_step // num_update_steps_per_epoch if not args.ignore_data_skip: - steps_trained_in_current_epoch = self.state.global_step % ( - num_update_steps_per_epoch) + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) steps_trained_in_current_epoch *= args.gradient_accumulation_steps else: steps_trained_in_current_epoch = 0 - logger.info( - " Continuing training from checkpoint, will skip to saved global_step" - ) + logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(f" Continuing training from epoch {epochs_trained}") - logger.info( - f" Continuing training from global step {self.state.global_step}" - ) + logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: if skip_first_batches is None: logger.info( @@ -478,18 +463,16 @@ def _inner_training_loop( f" {steps_trained_in_current_epoch} batches in the first epoch. If this takes a lot of time," " you can install the latest version of Accelerate with `pip install -U accelerate`.You can" " also add the `--ignore_data_skip` flag to your launch command, but you will resume the" - " training on data already seen by your model.") + " training on data already seen by your model." 
+ ) else: logger.info( f" Will skip the first {epochs_trained} epochs then the first" f" {steps_trained_in_current_epoch} batches in the first epoch." ) - if (self.is_local_process_zero() and not args.disable_tqdm and - skip_first_batches is None): - steps_trained_progress_bar = tqdm( - total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description( - "Skipping the first batches") + if self.is_local_process_zero() and not args.disable_tqdm and skip_first_batches is None: + steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) + steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model @@ -501,9 +484,7 @@ def _inner_training_loop( # parameter to Train when using DDP. self.state.trial_name = self.hp_name(self._trial) if trial is not None: - assignments = (trial.assignments - if self.hp_search_backend == HPSearchBackend.SIGOPT - else trial) + assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) else: self.state.trial_params = None @@ -521,15 +502,14 @@ def _inner_training_loop( self._globalstep_last_logged = self.state.global_step model.zero_grad() - self.control = self.callback_handler.on_train_begin(args, self.state, - self.control) + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr( - train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler) + is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( + train_dataloader.sampler, RandomSampler + ) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. # That was before PyTorch 1.11 however... 
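As a quick sanity check of the resume bookkeeping above (a saved global_step is split into completed epochs, update steps inside the current epoch, and dataloader batches to skip), a small worked example with made-up numbers:

# Suppose 1000 optimizer steps were already taken, with 300 update steps per epoch
# and gradient_accumulation_steps = 2.
global_step = 1000
num_update_steps_per_epoch = 300
gradient_accumulation_steps = 2

epochs_trained = global_step // num_update_steps_per_epoch                      # 3 completed epochs
steps_trained_in_current_epoch = global_step % num_update_steps_per_epoch       # 100 update steps into epoch 4
batches_to_skip = steps_trained_in_current_epoch * gradient_accumulation_steps  # 200 dataloader batches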
@@ -542,17 +522,13 @@ def _inner_training_loop( total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance( - train_dataloader.sampler, DistributedSampler): + if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance( - train_dataloader.dataset, IterableDatasetShard): + elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): train_dataloader.dataset.set_epoch(epoch) if is_torch_tpu_available(): - parallel_loader = pl.ParallelLoader( - train_dataloader, - [args.device]).per_device_loader(args.device) + parallel_loader = pl.ParallelLoader(train_dataloader, [args.device]).per_device_loader(args.device) epoch_iterator = parallel_loader else: epoch_iterator = train_dataloader @@ -561,22 +537,20 @@ def _inner_training_loop( if args.past_index >= 0: self._past = None - steps_in_epoch = (len(epoch_iterator) - if len_dataloader is not None else - args.max_steps * args.gradient_accumulation_steps) - self.control = self.callback_handler.on_epoch_begin( - args, self.state, self.control) + steps_in_epoch = ( + len(epoch_iterator) + if len_dataloader is not None + else args.max_steps * args.gradient_accumulation_steps + ) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) - if (epoch == epochs_trained and - resume_from_checkpoint is not None and - steps_trained_in_current_epoch == 0): + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) rng_to_sync = False steps_skipped = 0 if skip_first_batches is not None and steps_trained_in_current_epoch > 0: - epoch_iterator = skip_first_batches( - epoch_iterator, steps_trained_in_current_epoch) + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) steps_skipped = steps_trained_in_current_epoch steps_trained_in_current_epoch = 0 rng_to_sync = True @@ -601,18 +575,18 @@ def _inner_training_loop( steps_trained_progress_bar = None if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin( - args, self.state, self.control) + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) with self.accelerator.accumulate(model): tr_loss_step = self.training_step(model, inputs) - if (args.logging_nan_inf_filter and - not is_torch_tpu_available() and - (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))): + if ( + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): # if loss is nan or inf simply add the average of previous logged losses - tr_loss += tr_loss / (1 + self.state.global_step - - self._globalstep_last_logged) + tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: tr_loss += tr_loss_step @@ -622,9 +596,10 @@ def _inner_training_loop( # the `or` condition of `steps_in_epoch <= args.gradient_accumulation_steps` is not covered # in accelerate if total_batched_samples % args.gradient_accumulation_steps == 0 or ( - # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps and - (step + 1) == steps_in_epoch): + # last step in epoch but step is always smaller than 
gradient_accumulation_steps + steps_in_epoch <= args.gradient_accumulation_steps + and (step + 1) == steps_in_epoch + ): # Gradient clipping if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping @@ -633,10 +608,7 @@ def _inner_training_loop( # Reduce gradients first for XLA if is_torch_tpu_available(): gradients = xm._fetch_gradients(self.optimizer) - xm.all_reduce( - "sum", - gradients, - scale=1.0 / xm.xrt_world_size()) + xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size()) # AMP: gradients need unscaling self.scaler.unscale_(self.optimizer) @@ -652,11 +624,13 @@ def _inner_training_loop( # Revert to normal clipping otherwise, handling Apex or full precision nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), - args.max_grad_norm, ) + args.max_grad_norm, + ) else: self.accelerator.clip_grad_norm_( model.parameters(), - args.max_grad_norm, ) + args.max_grad_norm, + ) # Optimizer step optimizer_was_run = True @@ -674,22 +648,20 @@ def _inner_training_loop( optimizer_was_run = scale_before <= scale_after else: self.optimizer.step() - optimizer_was_run = ( - not self.accelerator.optimizer_step_was_skipped) + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: # Delay optimizer scheduling until metrics are generated if not isinstance( - self.lr_scheduler, - torch.optim.lr_scheduler.ReduceLROnPlateau, ): + self.lr_scheduler, + torch.optim.lr_scheduler.ReduceLROnPlateau, + ): self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = ( - epoch + (step + 1 + steps_skipped) / steps_in_epoch) - self.control = self.callback_handler.on_step_end( - args, self.state, self.control) + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) self._maybe_log_save_evaluate( tr_loss, @@ -697,10 +669,10 @@ def _inner_training_loop( trial, epoch, ignore_keys_for_eval, - inputs=inputs, ) + inputs=inputs, + ) else: - self.control = self.callback_handler.on_substep_end( - args, self.state, self.control) + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) if self.control.should_epoch_stop or self.control.should_training_stop: break @@ -712,15 +684,8 @@ def _inner_training_loop( ) self.control.should_training_stop = True - self.control = self.callback_handler.on_epoch_end(args, self.state, - self.control) - self._maybe_log_save_evaluate( - tr_loss, - model, - trial, - epoch, - ignore_keys_for_eval, - inputs=inputs) + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval, inputs=inputs) if DebugOption.TPU_METRICS_DEBUG in self.args.debug: if is_torch_tpu_available(): @@ -738,9 +703,7 @@ def _inner_training_loop( # Clean the state at the end of training delattr(self, "_past") - logger.info( - "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n" - ) + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sur the model has been saved by process 0. 
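# --- Editor's note: illustrative sketch, not part of the patch above. ---
# The inner loop above only clips gradients and steps the optimizer at gradient-
# accumulation boundaries (or on the final, shorter step of an epoch). A generic PyTorch
# rendering of that pattern; the model, data, and `accum_steps` below are placeholders,
# not identifiers from the patch.
import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_steps, max_grad_norm = 4, 1.0
batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(10)]
steps_in_epoch = len(batches)

for step, (x, y) in enumerate(batches):
    loss = nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()
    last_short_step = steps_in_epoch <= accum_steps and (step + 1) == steps_in_epoch
    if (step + 1) % accum_steps == 0 or last_short_step:
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # clip once per update
        optimizer.step()
        optimizer.zero_grad()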
if is_torch_tpu_available(): @@ -760,7 +723,8 @@ def _inner_training_loop( "train", start_time, num_samples=num_train_samples, - num_steps=self.state.max_steps, ) + num_steps=self.state.max_steps, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -772,27 +736,20 @@ def _inner_training_loop( self.log(metrics) run_dir = self._get_output_dir(trial) - checkpoints_sorted = self._sorted_checkpoints( - use_mtime=False, output_dir=run_dir) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. - if (self.args.should_save and - self.state.best_model_checkpoint is not None and - self.args.save_total_limit == 1): + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: for checkpoint in checkpoints_sorted: if checkpoint != self.state.best_model_checkpoint: - logger.info( - f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit" - ) + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") shutil.rmtree(checkpoint) - self.control = self.callback_handler.on_train_end(args, self.state, - self.control) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) return TrainOutput(self.state.global_step, train_loss, metrics) - def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, - ignore_keys_for_eval, **kwargs): + def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval, **kwargs): if self.control.should_log: if is_torch_tpu_available(): xm.mark_step() @@ -806,15 +763,15 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, tr_loss -= tr_loss logs["loss"] = round( - tr_loss_scalar / - (self.state.global_step - self._globalstep_last_logged), - 4, ) + tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), + 4, + ) logs["learning_rate"] = self._get_learning_rate() logs["global_step"] = int(self.state.global_step) - total_train_batch_size = (self.args.train_batch_size * - self.args.gradient_accumulation_steps * - self.args.world_size) + total_train_batch_size = ( + self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.world_size + ) num_steps = self.state.global_step - self._globalstep_last_logged self.store_flos() logs.update( @@ -822,7 +779,9 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, "interval", self._globalstep_last_start_time, num_samples=total_train_batch_size * num_steps, - num_steps=num_steps, )) + num_steps=num_steps, + ) + ) self._total_loss_scalar += tr_loss_scalar self._globalstep_last_logged = self.state.global_step @@ -834,20 +793,19 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, if self.control.should_evaluate: if isinstance(self.eval_dataset, dict): metrics = {} - for eval_dataset_name, eval_dataset in self.eval_dataset.items( - ): + for eval_dataset_name, eval_dataset in self.eval_dataset.items(): dataset_metrics = self.evaluate( eval_dataset=eval_dataset, ignore_keys=ignore_keys_for_eval, - metric_key_prefix=f"eval_{eval_dataset_name}", ) + metric_key_prefix=f"eval_{eval_dataset_name}", + ) metrics.update(dataset_metrics) else: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) self._report_to_hp_search(trial, self.state.global_step, metrics) # Run delayed LR scheduler now that metrics are populated - if 
isinstance(self.lr_scheduler, - torch.optim.lr_scheduler.ReduceLROnPlateau): + if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): metric_to_check = self.args.metric_for_best_model if not metric_to_check.startswith("eval_"): metric_to_check = f"eval_{metric_to_check}" @@ -855,17 +813,15 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, if self.control.should_save: self._save_checkpoint(model, trial, metrics=metrics) - self.control = self.callback_handler.on_save(self.args, self.state, - self.control) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) def log(self, logs: Dict[str, float], **kwargs) -> None: if self.state.epoch is not None: logs["epoch"] = round(self.state.epoch, 2) - output = { ** logs, ** {"step": self.state.global_step}} + output = {**logs, **{"step": self.state.global_step}} self.state.log_history.append(output) - self.control = self.callback_handler.on_log( - self.args, self.state, self.control, logs, **kwargs) + self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs, **kwargs) def _save(self, output_dir=None, state_dict=None): output_dir = output_dir if output_dir is not None else self.args.output_dir @@ -873,34 +829,26 @@ def _save(self, output_dir=None, state_dict=None): if self.args.only_save_updated_model: unwraped_model = unwrap_model(self.model) logger.info(f"Saving unet checkpoint to {output_dir}/unet") - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet")) + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet")) if unwraped_model.use_ema: logger.info(f"Saving ema unet checkpoint to {output_dir}/unet") with unwraped_model.ema_scope(): - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet"), variant="ema") + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema") if unwraped_model.train_text_encoder: - logger.info( - f"Saving text encoder checkpoint to {output_dir}/text_encoder" - ) - unwraped_model.text_encoder.save_pretrained( - os.path.join(output_dir, "text_encoder")) + logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder") + unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder")) else: logger.info(f"Saving model checkpoint to {output_dir}") if state_dict is None: state_dict = self.model.state_dict() - logger.info( - "Trainer.model is not a `PreTrainedModel`, only saving its state dict." 
- ) + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") if self.args.save_safetensors: import safetensors - safetensors.torch.save_file( - state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME)) + safetensors.torch.save_file(state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME)) else: torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py index 23507e6820cf0..6cbf69c57a1d4 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py @@ -45,8 +45,7 @@ def parse_src(filename): elif data_source == "laion_aes": text_json = json.loads(vec[2]) img_b64 = vec[5] - caption = text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) else: _, captions, _, _, _, img_b64 = vec[:6] caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") @@ -63,24 +62,27 @@ def parse_src(filename): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + ): self.size = size assert interpolation == "lanczos" if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), InterpolationMode.LANCZOS), - transforms.RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), InterpolationMode.LANCZOS), + transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.text_processing = lambda caption: tokenizer( @@ -88,7 +90,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="pt", ).input_ids[0] + return_tensors="pt", + ).input_ids[0] self.file_list = [] file_weights = [] with open(file_list, "r") as f: @@ -109,19 +112,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -130,9 +128,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - 
filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -158,19 +154,14 @@ def sample_loader(self, file_ids, filenames): if w < self.size or h < self.size: continue yield { - "pixel_values": - self.image_processing(data["image"]), - "input_ids": - self.text_processing(data["caption"]), + "pixel_values": self.image_processing(data["image"]), + "input_ids": self.text_processing(data["caption"]), } def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -179,8 +170,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py index 0f6ad8874e14d..668ad3aae54a9 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py @@ -13,16 +13,20 @@ # limitations under the License. import os -import torch import transformers -from sd import (SDDataArguments, SDModelArguments, SDTrainingArguments, - StableDiffusionModel, StableDiffusionTrainer, TextImagePair) +from sd import ( + SDDataArguments, + SDModelArguments, + SDTrainingArguments, + StableDiffusionModel, + StableDiffusionTrainer, + TextImagePair, +) from transformers.trainer import get_last_checkpoint, set_seed def main(): - parser = transformers.HfArgumentParser( - (SDModelArguments, SDDataArguments, SDTrainingArguments)) + parser = transformers.HfArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() log_level = training_args.get_process_log_level() @@ -37,16 +41,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: print( f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -65,13 +67,15 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation=data_args.interpolation, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) trainer = StableDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) checkpoint = None if training_args.resume_from_checkpoint is not None: diff --git a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py index 7e0b5e6488085..4f4cd63ceb164 100644 --- a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py @@ -17,13 +17,18 @@ import paddle from paddlenlp.trainer import PdArgumentParser, get_last_checkpoint, set_seed from paddlenlp.utils.log import logger -from sd import (SDDataArguments, SDModelArguments, SDTrainingArguments, - StableDiffusionModel, StableDiffusionTrainer, TextImagePair) +from sd import ( + SDDataArguments, + SDModelArguments, + SDTrainingArguments, + StableDiffusionModel, + StableDiffusionTrainer, + TextImagePair, +) def main(): - parser = PdArgumentParser( - (SDModelArguments, SDDataArguments, SDTrainingArguments)) + parser = PdArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -32,16 +37,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
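# --- Editor's note: illustrative sketch, not part of the patch above. ---
# Both the torch and the paddle launcher guard the output directory the same way before
# training starts. A condensed version of that guard; `get_last_checkpoint` is the
# trainer helper used above (returns the newest checkpoint folder or None).
import os


def detect_resume_point(output_dir, do_train, overwrite_output_dir, get_last_checkpoint):
    if not (os.path.isdir(output_dir) and do_train and not overwrite_output_dir):
        return None
    last_checkpoint = get_last_checkpoint(output_dir)
    if last_checkpoint is None and len(os.listdir(output_dir)) > 0:
        raise ValueError(
            f"Output directory ({output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    return last_checkpoint  # None -> train from scratch, a path -> resume from it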
@@ -55,19 +58,16 @@ def main(): model.set_ema(training_args.use_ema) if training_args.to_static: - input_ids = paddle.static.InputSpec( - name="input_ids", - shape=[-1, model_args.model_max_length], - dtype="int64") + input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64") pixel_values = paddle.static.InputSpec( name="pixel_values", shape=[-1, 3, training_args.resolution, training_args.resolution], - dtype="float32", ) + dtype="float32", + ) specs = [input_ids, pixel_values] paddle.jit.ignore_module([os]) model = paddle.jit.to_static(model, input_spec=specs) - logger.info("Successfully to apply @to_static with specs: {}".format( - specs)) + logger.info("Successfully to apply @to_static with specs: {}".format(specs)) train_dataset = TextImagePair( file_list=data_args.file_list, @@ -76,18 +76,19 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation=data_args.interpolation, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) trainer = StableDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) if model_args.train_text_encoder: if training_args.text_encoder_learning_rate == training_args.unet_learning_rate: - params_to_train = itertools.chain(model.text_encoder.parameters(), - model.unet.parameters()) + params_to_train = itertools.chain(model.text_encoder.parameters(), model.unet.parameters()) else: # overwrite default learning rate with 1.0 training_args.learning_rate = 1.0 diff --git a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py index aee1fac6ac23b..857c78b0ae1a9 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py @@ -24,60 +24,46 @@ class ModelArguments: adapter_config_file: Optional[str] = field( default="./config/openpose_adapter.json", - metadata={"help": "adapter_config_file"}, ) - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "pretrained_vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_encoder_name_or_path"}) + metadata={"help": "adapter_config_file"}, + ) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }, ) - model_max_length: Optional[int] = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field( - default=50, metadata={"help": "num_inference_steps"}) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, + ) + model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + num_inference_steps: Optional[int] = field(default=50, metadata={"help": 
"num_inference_steps"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) pretrained_model_name_or_path: str = field( default="runwayml/stable-diffusion-v1-5", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) pretrained_adapter_name_or_path: str = field( default=None, metadata={ - "help": - "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training." - }, ) - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init."}) - is_ldmbert: bool = field( - default=False, metadata={"help": "Whether to use ldmbert."}) + "help": "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training." + }, + ) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."}) + is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) - control_type: Optional[str] = field( - default="canny", metadata={"help": "The type of control"}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) + control_type: Optional[str] = field(default="canny", metadata={"help": "The type of control"}) latents_path: str = field( default=None, - metadata={"help": "Path to latents, used for alignment."}, ) - random_alignment: bool = field( - default=False, metadata={"help": "Whether to align random."}) + metadata={"help": "Path to latents, used for alignment."}, + ) + random_alignment: bool = field(default=False, metadata={"help": "Whether to align random."}) timestep_sample_schedule: Optional[str] = field( default="linear", metadata={ - "help": - "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']." - }, ) + "help": "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']." + }, + ) @dataclass @@ -88,26 +74,29 @@ class DataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." + }, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) data_format: str = field( default="default", metadata={ - "help": - "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image." 
- }, ) + "help": "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image." + }, + ) @dataclass @@ -116,45 +105,28 @@ class GenerateArguments: Arguments pertaining to specify the model generation settings. """ - use_controlnet: bool = field( - default=False, metadata={"help": "Whether or not use text condition"}) - use_dumpy_dataset: bool = field( - default=False, metadata={"help": "Whether or not use dummpy dataset"}) - adapter_model_name_or_path: str = field( - default=None, metadata={"help": "adapter model name or path."}) - sd_model_name_or_path: str = field( - default=None, metadata={"help": "sd model name or path."}) - file: str = field( - default="data/test.openpose.filelist", metadata={"help": "eval file."}) + use_controlnet: bool = field(default=False, metadata={"help": "Whether or not use text condition"}) + use_dumpy_dataset: bool = field(default=False, metadata={"help": "Whether or not use dummpy dataset"}) + adapter_model_name_or_path: str = field(default=None, metadata={"help": "adapter model name or path."}) + sd_model_name_or_path: str = field(default=None, metadata={"help": "sd model name or path."}) + file: str = field(default="data/test.openpose.filelist", metadata={"help": "eval file."}) seed: int = field(default=42, metadata={"help": "random seed."}) scheduler_type: str = field( default="ddim", - metadata={ - "help": - "Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']" - }, ) + metadata={"help": "Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']"}, + ) device: str = field(default="gpu", metadata={"help": "device"}) batch_size: int = field(default=16, metadata={"help": "batch_size"}) - num_inference_steps: int = field( - default=50, metadata={"help": "num_inference_steps"}) - save_path: str = field( - default="output/adapter/", - metadata={"help": "Path to the output file."}) - guidance_scales: str = field( - default_factory=lambda: [5, 7, 9], - metadata={"help": "guidance_scales list."}) + num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) + save_path: str = field(default="output/adapter/", metadata={"help": "Path to the output file."}) + guidance_scales: str = field(default_factory=lambda: [5, 7, 9], metadata={"help": "guidance_scales list."}) height: int = field(default=512, metadata={"help": "height."}) width: int = field(default=512, metadata={"help": "width."}) - max_generation_limits: int = field( - default=1000, metadata={"help": "max generation limits."}) - use_text_cond: bool = field( - default=True, metadata={"help": "Whether or not use text condition"}) + max_generation_limits: int = field(default=1000, metadata={"help": "max generation limits."}) + use_text_cond: bool = field(default=True, metadata={"help": "Whether or not use text condition"}) use_default_neg_text_cond: bool = field( default=True, - metadata={ - "help": "Whether or not use default negative text condition" - }, ) - generate_data_format: str = field( - default="img2img", metadata={"help": "Generate data format."}) - generate_control_image_processor_type: str = field( - default="openpose", metadata={"help": "Generate data format."}) + metadata={"help": "Whether or not use default negative text condition"}, + ) + generate_data_format: str = field(default="img2img", metadata={"help": "Generate data format."}) + generate_control_image_processor_type: str = field(default="openpose", metadata={"help": "Generate data format."}) diff --git 
a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py index a6151bf307d1c..b7ff85077b613 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py @@ -20,8 +20,11 @@ import paddle.amp.auto_cast as autocast from paddle.io import DataLoader from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK, - VisualDLCallback, rewrite_logs) +from paddlenlp.trainer.integrations import ( + INTEGRATION_TO_CALLBACK, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.utils.log import logger from ppdiffusers.training_utils import unwrap_model @@ -40,19 +43,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -63,20 +64,22 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"]) - image_logs["control"] = model.decode_control_image( - adapter_cond=inputs["adapter_cond"]) + image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) + image_logs["control"] = model.decode_control_image(adapter_cond=inputs["adapter_cond"]) image_logs["ddim-samples-9.0"] = model.log_image( input_ids=inputs["input_ids"], adapter_cond=inputs["adapter_cond"], guidance_scale=9.0, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) if self.vdl_writer is None: self._init_summary_writer(args) @@ -91,11 +94,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." 
+ ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -104,12 +107,9 @@ def on_log(self, args, state, control, logs=None, **kwargs): def collate_fn(examples): - pixel_values = paddle.stack( - [paddle.to_tensor(example["pixel_values"]) for example in examples]) - input_ids = paddle.stack( - [paddle.to_tensor(example["input_ids"]) for example in examples]) - adapter_cond = paddle.stack( - [paddle.to_tensor(example["adapter_cond"]) for example in examples]) + pixel_values = paddle.stack([paddle.to_tensor(example["pixel_values"]) for example in examples]) + input_ids = paddle.stack([paddle.to_tensor(example["input_ids"]) for example in examples]) + adapter_cond = paddle.stack([paddle.to_tensor(example["adapter_cond"]) for example in examples]) batch = { "input_ids": input_ids, @@ -133,18 +133,16 @@ def get_train_dataloader(self): batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, worker_init_fn=worker_init_fn, - collate_fn=collate_fn, ) + collate_fn=collate_fn, + ) else: return super().get_train_dataloader() - def _save(self, - output_dir=None, - state_dict=None, - merge_tensor_parallel=False): + def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): super()._save( output_dir=output_dir, state_dict=state_dict, - merge_tensor_parallel=merge_tensor_parallel, ) + merge_tensor_parallel=merge_tensor_parallel, + ) output_dir = output_dir if output_dir is not None else self.args.output_dir - unwrap_model(self.model).adapter.save_pretrained( - os.path.join(output_dir, "adapter")) + unwrap_model(self.model).adapter.save_pretrained(os.path.join(output_dir, "adapter")) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py index 5dd1dec076803..e179df14c8f40 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py @@ -45,8 +45,7 @@ def process_data(line, filename, data_format): control_image_b64str = None caption = "" - caption += text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption += text_json.get("caption_en", text_json.get("blip_caption_en", "")) if caption != "": image_base64 = image_b64str else: @@ -65,11 +64,9 @@ def parse_line(line, filename, data_format="default"): res = process_data(line, filename, data_format) if res is not None: image_base64, caption, _id, control_image_base64 = res - image = Image.open(io.BytesIO(base64.b64decode( - image_base64))).convert("RGB") + image = Image.open(io.BytesIO(base64.b64decode(image_base64))).convert("RGB") if control_image_base64 is not None: - image_extract = io.BytesIO( - base64.b64decode(control_image_base64)) + image_extract = io.BytesIO(base64.b64decode(control_image_base64)) control_image = Image.open(image_extract).convert("RGB") control_image = control_image.resize(image.size) @@ -83,7 +80,8 @@ def parse_line(line, filename, data_format="default"): (image.size[0] - image.size[1]) // 2, 0, (image.size[0] + image.size[1]) // 2, - image.size[1], ) + image.size[1], + ) image = image.crop(crop_size) if control_image is not None: control_image = control_image.crop(crop_size) @@ -95,7 +93,8 @@ def parse_line(line, filename, data_format="default"): image=image, caption=caption, _id=_id, - control_image=control_image, ) + 
control_image=control_image, + ) else: return None except Exception as e: diff --git a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py index 91969cb548b8c..74b3617fb060b 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py @@ -23,11 +23,12 @@ class Fill50kDataset(Dataset): def __init__( - self, - tokenizer, - file_path="./fill50k", - do_image_processing=True, - do_text_processing=True, ): + self, + tokenizer, + file_path="./fill50k", + do_image_processing=True, + do_text_processing=True, + ): self.tokenizer = tokenizer self.image_list = [] self.label_list = [] @@ -47,7 +48,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] self.do_image_processing = do_image_processing self.do_text_processing = do_text_processing @@ -67,13 +69,11 @@ def __getitem__(self, idx): if self.do_image_processing: # Normalize source images to [0, 1]. source = source.astype(np.float32) / 255.0 - source = paddle.to_tensor( - source.transpose([2, 0, 1]), dtype=paddle.float32) + source = paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32) # Normalize target images to [-1, 1]. target = (target.astype(np.float32) / 127.5) - 1.0 - target = paddle.to_tensor( - target.transpose([2, 0, 1]), dtype=paddle.float32) + target = paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32) if self.text_processing and self.do_text_processing: input_ids = self.text_processing(prompt) @@ -84,4 +84,5 @@ def __getitem__(self, idx): return dict( input_ids=input_ids, pixel_values=target, - adapter_cond=source, ) + adapter_cond=source, + ) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/model.py b/ppdiffusers/examples/t2i-adapter/adapter/model.py index 2e31f0262f56b..1c9d6f678955e 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/model.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/model.py @@ -24,9 +24,16 @@ from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LDMBertModel, T2IAdapter, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LDMBertModel, + T2IAdapter, + UNet2DConditionModel, + is_ppxformers_available, +) + # from ppdiffusers.initializer import reset_initialized_parameter from ppdiffusers.models.ema import LitEma from ppdiffusers.training_utils import freeze_params @@ -52,18 +59,20 @@ def __init__(self, model_args): # init tokenizer tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - model_max_length=model_args.model_max_length) + tokenizer_name_or_path, model_max_length=model_args.model_max_length + ) vae_name = "vqvae" if model_args.is_ldmbert else "vae" # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, vae_name)) + if model_args.pretrained_model_name_or_path is 
None + else os.path.join(model_args.pretrained_model_name_or_path, vae_name) + ) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) freeze_params(self.vae.parameters()) @@ -72,27 +81,27 @@ def __init__(self, model_args): if model_args.is_ldmbert: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "bert")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "bert") + ) # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path) else: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) freeze_params(self.text_encoder.parameters()) logger.info("Freeze text_encoder parameters!") unet_name_or_path = ( model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) @@ -100,44 +109,43 @@ def __init__(self, model_args): logger.info("Freeze unet parameters!") if model_args.pretrained_adapter_name_or_path: - self.adapter = T2IAdapter.from_pretrained( - model_args.pretrained_adapter_name_or_path) + self.adapter = T2IAdapter.from_pretrained(model_args.pretrained_adapter_name_or_path) else: - self.adapter = T2IAdapter( - **read_json(model_args.adapter_config_file)) + self.adapter = T2IAdapter(**read_json(model_args.adapter_config_file)) self.noise_scheduler = DDPMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) self.eval_scheduler.set_timesteps(model_args.num_inference_steps) self.use_ema = model_args.use_ema if self.use_ema: self.model_ema = LitEma(self.adapter) self.adapter_conditioning_scale = 1.0 - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() self.adapter.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) self.use_preconfig_latents = False if model_args.latents_path: self.use_preconfig_latents = True - self.register_buffer("preconfig_latents", - paddle.load(model_args.latents_path)) + self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path)) self.random_alignment = model_args.random_alignment self.timestep_sample_schedule = model_args.timestep_sample_schedule @@ -162,36 +170,29 @@ def on_train_batch_end(self): def get_time_with_schedule(self, timestep_sample_schedule, bs): if timestep_sample_schedule == "linear": - t = paddle.randint( - low=0, - high=self.noise_scheduler.num_train_timesteps, - shape=(bs, )).astype(dtype="int64") + t = paddle.randint(low=0, high=self.noise_scheduler.num_train_timesteps, shape=(bs,)).astype(dtype="int64") elif timestep_sample_schedule == "cosine": - t = paddle.rand(shape=(bs, )) - t = paddle.cos(x=np.pi / 2.0 * - t) * self.noise_scheduler.num_train_timesteps + t = paddle.rand(shape=(bs,)) + t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps t = t.astype(dtype="int64") elif timestep_sample_schedule == "cubic": - t = paddle.rand(shape=(bs, )) + t = paddle.rand(shape=(bs,)) t = (1 - t**3) * self.noise_scheduler.num_train_timesteps t = t.astype(dtype="int64") else: raise NotImplementedError - t = paddle.clip( - x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) + t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) return t - def get_time_with_schedule_and_numpy_generator( - self, timestep_sample_schedule, bs): + def get_time_with_schedule_and_numpy_generator(self, timestep_sample_schedule, bs): if timestep_sample_schedule == "linear": t = paddle.to_tensor( - generator.randint( - 0, self.noise_scheduler.num_train_timesteps, size=(bs, )), - dtype="int64", ) + generator.randint(0, self.noise_scheduler.num_train_timesteps, size=(bs,)), + dtype="int64", + ) elif timestep_sample_schedule == "cosine": t = paddle.to_tensor(generator.rand(bs)) - t = paddle.cos(x=np.pi / 2.0 * - t) * self.noise_scheduler.num_train_timesteps + t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps t = t.astype(dtype="int64") elif timestep_sample_schedule == "cubic": t = paddle.to_tensor(generator.rand(bs)) @@ -199,18 +200,12 @@ def get_time_with_schedule_and_numpy_generator( t = t.astype(dtype="int64") else: raise NotImplementedError - t = paddle.clip( - x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) + t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) return t - def forward(self, - input_ids=None, - pixel_values=None, - adapter_cond=None, - **kwargs): + def forward(self, input_ids=None, pixel_values=None, adapter_cond=None, **kwargs): with paddle.no_grad(): - adapter_cond = self.control_image_processor.process_model_forward( - adapter_cond) + adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) self.train() with paddle.amp.auto_cast(enable=False): with paddle.no_grad(): @@ -220,15 +215,13 @@ def forward(self, latents = latents * 0.18215 if self.random_alignment: timesteps = self.get_time_with_schedule_and_numpy_generator( - self.timestep_sample_schedule, latents.shape[0]) - noise = paddle.to_tensor( - generator.randn(*latents.shape), dtype="float32") + self.timestep_sample_schedule, latents.shape[0] + ) + noise = paddle.to_tensor(generator.randn(*latents.shape), dtype="float32") 
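# --- Editor's note: illustrative sketch, not part of the patch above. ---
# The "linear", "cosine" and "cubic" schedules in get_time_with_schedule differ only in
# how a uniform draw is warped before scaling to [0, num_train_timesteps); the cosine and
# cubic variants concentrate draws toward the high-noise end of the range. A NumPy sketch
# of the same mapping; the generator seed and T are placeholders.
import numpy as np

rng = np.random.default_rng(0)
T = 1000  # num_train_timesteps


def sample_timesteps(schedule, batch_size):
    if schedule == "linear":
        t = rng.integers(0, T, size=batch_size)
    elif schedule == "cosine":
        t = (np.cos(np.pi / 2.0 * rng.random(batch_size)) * T).astype(np.int64)
    elif schedule == "cubic":
        t = ((1 - rng.random(batch_size) ** 3) * T).astype(np.int64)
    else:
        raise NotImplementedError(schedule)
    return np.clip(t, 0, T - 1)


print(sample_timesteps("cosine", 4))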
else: - timesteps = self.get_time_with_schedule( - self.timestep_sample_schedule, latents.shape[0]) + timesteps = self.get_time_with_schedule(self.timestep_sample_schedule, latents.shape[0]) noise = paddle.randn(latents.shape) - noisy_latents = self.noise_scheduler.add_noise(latents, noise, - timesteps) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) encoder_hidden_states = self.text_encoder(input_ids)[0] adapter_state = self.adapter(adapter_cond) @@ -240,7 +233,8 @@ def forward(self, noisy_latents, timestep=timesteps, encoder_hidden_states=encoder_hidden_states, - down_block_additional_residuals=adapter_state, ).sample + down_block_additional_residuals=adapter_state, + ).sample loss = F.mse_loss(noise_pred, noise, reduction="mean") return loss @@ -257,29 +251,25 @@ def decode_image(self, pixel_values=None, **kwargs): @paddle.no_grad() def decode_control_image(self, adapter_cond=None, **kwargs): - adapter_cond = self.control_image_processor.process_model_forward( - adapter_cond) # (0, 1) - return (255 * (adapter_cond.transpose( - [0, 2, 3, 1])).cast("float32").numpy().round()) + adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) # (0, 1) + return 255 * (adapter_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round() @paddle.no_grad() def log_image( - self, - input_ids=None, - adapter_cond=None, - height=512, - width=512, - eta=0.0, - guidance_scale=9, - **kwargs, ): - adapter_cond = self.control_image_processor.process_model_forward( - adapter_cond) + self, + input_ids=None, + adapter_cond=None, + height=512, + width=512, + eta=0.0, + guidance_scale=9, + **kwargs, + ): + adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 8 image if input_ids.shape[0] > 4: input_ids = input_ids[:4] @@ -293,33 +283,28 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) if self.use_preconfig_latents: latents = self.preconfig_latents else: - latents = paddle.randn( - (input_ids.shape[0], self.unet.in_channels, height // 8, - width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) # ddim donot use this latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # Adapter predict the noise residual adapter_state = self.adapter(adapter_cond) @@ -334,19 +319,16 @@ def log_image( latent_model_input, t, encoder_hidden_states=text_embeddings, - down_block_additional_residuals=[ - state.clone() for state in adapter_state - ], ).sample + down_block_additional_residuals=[state.clone() for state in adapter_state], + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample @@ -358,7 +340,6 @@ def set_recompute(self, value=False): def fn(layer): if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.adapter.apply(fn) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py index a523be48b4663..a3d1481c39807 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py @@ -43,25 +43,28 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - 
tokenizer=None, - control_image_processor=None, - data_format="default", - do_image_processing=True, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + control_image_processor=None, + data_format="default", + do_image_processing=True, + ): self.size = size self.resize_transform = transforms.Resize(int(size), interpolation) if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.ToTensor(), # (0 ~ 1) - transforms.Normalize(0.5, 0.5), # (-1 ~ 1) - ]) + self.image_processing = transforms.Compose( + [ + transforms.ToTensor(), # (0 ~ 1) + transforms.Normalize(0.5, 0.5), # (-1 ~ 1) + ] + ) else: self.image_processing = image_processing if tokenizer is not None: @@ -70,7 +73,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] else: self.text_processing = None @@ -99,19 +103,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples self.data_format = data_format @@ -122,9 +121,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -150,31 +147,26 @@ def sample_loader(self, file_ids, filenames): control_image = data["control_image"] if control_image is not None: - control_image = self.resize_transform( - control_image) + control_image = self.resize_transform(control_image) else: control_image = image out = { - "pixel_values": - self.image_processing(image).numpy() - if self.do_image_processing else image, - "input_ids": - self.text_processing(data["caption"]) - if self.text_processing else data["caption"], - "adapter_cond": - self.control_image_processor.process_data_load( - control_image).numpy() if - self.control_image_processor else control_image, + "pixel_values": self.image_processing(image).numpy() + if self.do_image_processing + else image, + "input_ids": self.text_processing(data["caption"]) + if self.text_processing + else data["caption"], + "adapter_cond": self.control_image_processor.process_data_load(control_image).numpy() + if self.control_image_processor + else control_image, } yield out def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") 
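# --- Editor's note: illustrative sketch, not part of the patch above. ---
# random_load_from_multi_dataset (continued just below) draws a uniform number and looks
# it up in the cumulative-weight table built above to decide which file list to read
# next. A self-contained sketch of that selection rule with made-up weights and loaders.
import itertools
import random

import numpy as np

file_weights = np.array([0.7, 0.2, 0.1])                   # placeholder per-list weights
cumsum = np.concatenate([[0.0], np.cumsum(file_weights)])  # [0.0, 0.7, 0.9, 1.0]
loaders = [itertools.repeat(f"sample-from-list-{i}") for i in range(3)]  # dummy infinite loaders


def next_sample():
    r = random.random()
    for i in range(len(file_weights)):
        if cumsum[i] <= r < cumsum[i + 1]:
            break
    return next(loaders[i])


print([next_sample() for _ in range(5)])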
sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -183,8 +175,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] yield next(sample_loader) diff --git a/ppdiffusers/examples/t2i-adapter/generate.py b/ppdiffusers/examples/t2i-adapter/generate.py index b4afa6609c6eb..1197dc715e704 100644 --- a/ppdiffusers/examples/t2i-adapter/generate.py +++ b/ppdiffusers/examples/t2i-adapter/generate.py @@ -17,22 +17,28 @@ import numpy as np import paddle -from adapter import (DataArguments, Fill50kDataset, GenerateArguments, - TextImagePair) +from adapter import DataArguments, Fill50kDataset, GenerateArguments, TextImagePair from annotator.canny import CannyDetector from annotator.util import HWC3 from paddlenlp.trainer import PdArgumentParser from PIL import Image from tqdm import tqdm -from ppdiffusers import (ControlNetModel, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionAdapterPipeline, - StableDiffusionControlNetPipeline, T2IAdapter) +from ppdiffusers import ( + ControlNetModel, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionAdapterPipeline, + StableDiffusionControlNetPipeline, + T2IAdapter, +) DEFAULT_NEGATIVE_PROMPT = ( "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " - "fewer digits, cropped, worst quality, low quality") + "fewer digits, cropped, worst quality, low quality" +) class CannyProcessor: @@ -79,31 +85,34 @@ def set_seed(seed: int): def generate_images( - use_controlnet=False, - adapter_model_name_or_path=None, - sd_model_name_or_path=None, - batch_size=16, - test_dataset=None, - save_path="output", - guidance_scales=[3, 4, 5, 6, 7, 8], - num_inference_steps=50, - scheduler_type="ddim", - device="gpu", - max_generation_limits=1000, - use_text_cond=True, - use_default_neg_text_cond=True, - generate_control_image_processor_type=None, - eta=0.0, ): + use_controlnet=False, + adapter_model_name_or_path=None, + sd_model_name_or_path=None, + batch_size=16, + test_dataset=None, + save_path="output", + guidance_scales=[3, 4, 5, 6, 7, 8], + num_inference_steps=50, + scheduler_type="ddim", + device="gpu", + max_generation_limits=1000, + use_text_cond=True, + use_default_neg_text_cond=True, + generate_control_image_processor_type=None, + eta=0.0, +): # set pipe paddle.set_device(device) if use_controlnet: controlnet = ControlNetModel.from_pretrained(adapter_model_name_or_path) pipe = StableDiffusionControlNetPipeline.from_pretrained( - sd_model_name_or_path, controlnet=controlnet, safety_checker=None) + sd_model_name_or_path, controlnet=controlnet, safety_checker=None + ) else: adapter = T2IAdapter.from_pretrained(adapter_model_name_or_path) pipe = StableDiffusionAdapterPipeline.from_pretrained( - sd_model_name_or_path, adapter=adapter, safety_checker=None) + sd_model_name_or_path, adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=True) # set scheduler @@ -117,17 +126,14 @@ def generate_images( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler 
compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -136,7 +142,8 @@ def generate_images( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") pipe.scheduler = scheduler @@ -158,24 +165,21 @@ def generate_images( write_file = open(os.path.join(save_path, "caption.txt"), "w") i = 0 for data in tqdm(test_dataset): - if (generate_control_image_processor_type == - "canny"): # Canny mode needs to manually process the control image - data["adapter_cond"] = canny_processor.process_data_load(data[ - "pixel_values"]) + if ( + generate_control_image_processor_type == "canny" + ): # Canny mode needs to manually process the control image + data["adapter_cond"] = canny_processor.process_data_load(data["pixel_values"]) images = pipe( data["input_ids"] if use_text_cond else "", - negative_prompt=DEFAULT_NEGATIVE_PROMPT - if use_default_neg_text_cond else "", + negative_prompt=DEFAULT_NEGATIVE_PROMPT if use_default_neg_text_cond else "", image=data["adapter_cond"], guidance_scale=float(cfg), eta=eta, - num_inference_steps=num_inference_steps, )[0] - data["adapter_cond"].save( - os.path.join(cond_save_path, "{:05d}_000.png".format(i))) - data["pixel_values"].save( - os.path.join(origin_save_path, "{:05d}_000.png".format(i))) - write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"] - .strip() + "\n") + num_inference_steps=num_inference_steps, + )[0] + data["adapter_cond"].save(os.path.join(cond_save_path, "{:05d}_000.png".format(i))) + data["pixel_values"].save(os.path.join(origin_save_path, "{:05d}_000.png".format(i))) + write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"].strip() + "\n") for image in images: path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) image.save(path) @@ -198,7 +202,8 @@ def generate_images( tokenizer=None, file_path=generate_args.file, do_image_processing=False, - do_text_processing=False, ) + do_text_processing=False, + ) else: test_dataset = TextImagePair( @@ -210,7 +215,8 @@ def generate_images( interpolation="lanczos", data_format=generate_args.generate_data_format, control_image_processor=None, - do_image_processing=False, ) + do_image_processing=False, + ) generate_images( use_controlnet=generate_args.use_controlnet, @@ -226,5 +232,5 @@ def generate_images( max_generation_limits=generate_args.max_generation_limits, use_text_cond=generate_args.use_text_cond, use_default_neg_text_cond=generate_args.use_default_neg_text_cond, - generate_control_image_processor_type=generate_args. 
- generate_control_image_processor_type, ) + generate_control_image_processor_type=generate_args.generate_control_image_processor_type, + ) diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py index 758e595e0ae59..01f4839ec21ff 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py +++ b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py @@ -39,8 +39,7 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -51,11 +50,11 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): "--output_path", type=str, default="paddle_models/sd-v1-4-adapter-color", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - th_controlnet = DiffusersAdapterNetModel.from_pretrained( - args.pretrained_model_name_or_path) + th_controlnet = DiffusersAdapterNetModel.from_pretrained(args.pretrained_model_name_or_path) controlnet_state_dict = convert_to_ppdiffusers(th_controlnet) pp_controlnet = PPDiffusersAdapterNetModel.from_config(th_controlnet.config) pp_controlnet.set_dict(controlnet_state_dict) diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py index 824cb9d41f945..165fb8d562914 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py @@ -42,10 +42,7 @@ def convert_to_paddle(vae_or_unet, dtype="float32"): @patch_to(paddle.nn.Layer) -def load_state_dict(self: paddle.nn.Layer, - state_dict: dict, - use_structured_name=True, - strict=True): +def load_state_dict(self: paddle.nn.Layer, state_dict: dict, use_structured_name=True, strict=True): orig = self.state_dict() orig_keys = set([k for k in orig.keys()]) loaded_keys = set([k for k in state_dict.keys()]) @@ -76,29 +73,32 @@ def apply(name): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--orig_t2i_adapter_project_path", type=str, default="pytorch/T2I-Adapter", - help="Path to a torch model parameters file", ) + help="Path to a torch model parameters file", + ) parser.add_argument( "--orig_t2i_adapter_pretrained_ckpt_path", type=str, default="ckpt/t2iadapter_openpose_sd14v1.pth", - help="Path to a torch model parameters file", ) + help="Path to a torch model parameters file", + ) parser.add_argument( "--ppdiffusers_t2i_adapter_model_config_path", type=str, default="ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json", - help="Path to a torch model parameters file", ) + help="Path to a torch model parameters file", + ) parser.add_argument( "--ppdiffusers_t2i_adapter_model_output_path", type=str, default="paddle_models/sd-v1-4-adapter-openpose_initialized", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() import os @@ 
-113,19 +113,21 @@ def apply(name): nums_rb=2, ksize=1, sk=True, - use_conv=False, ) + use_conv=False, + ) from ppdiffusers import T2IAdapter as paddle_network - Paddle_Model = paddle_network( - **read_json(args.ppdiffusers_t2i_adapter_model_config_path)) + Paddle_Model = paddle_network(**read_json(args.ppdiffusers_t2i_adapter_model_config_path)) torch_model = Torch_Model if args.orig_t2i_adapter_pretrained_ckpt_path: torch_model.load_state_dict( torch.load( args.orig_t2i_adapter_pretrained_ckpt_path, - map_location=torch.device("cpu"), ), - strict=True, ) + map_location=torch.device("cpu"), + ), + strict=True, + ) # When orig_t2i_adapter_pretrained_ckpt_path is not specified, the randomly initialized torch weights are stored in orig_t2i_adapter_pretrained_ckpt_path else: torch.save( @@ -133,7 +135,9 @@ def apply(name): os.path.join( args.orig_t2i_adapter_project_path, "ckpt", - "torch_t2i_model_initialized.pth", ), ) + "torch_t2i_model_initialized.pth", + ), + ) torch_model_dict = convert_adapter(torch_model.state_dict()) numpy_state_dict = convert_to_paddle(torch_model_dict) paddle_model = Paddle_Model diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py index dd6cc4ced4689..45f7f2262e5fd 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py +++ b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py @@ -76,13 +76,15 @@ def convert_adapter_light(old_state_dict): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--output_path", default=None, type=str, required=True, - help="Path to the store the result checkpoint.", ) + help="Path to the store the result checkpoint.", + ) parser.add_argument( "--is_adapter_light", default=False, diff --git a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py index 432265f92f6db..172b6727c299f 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py +++ b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py @@ -30,60 +30,59 @@ "--dataset_base_name", type=str, default="artv4_openpose_test13", - help="The dataset basename.", ) + help="The dataset basename.", +) parser.add_argument( "--ids_list_path", type=str, default="artv4_openpose_test13_ids.txt", - help="The ids list path.", ) + help="The ids list path.", +) parser.add_argument( "--ids_list_path", type=str, default="artv4_openpose_test13_ids.txt", - help="The ids list path.", ) + help="The ids list path.", +) parser.add_argument( "--source_prompt_list_one_path", type=str, default="prompts_artv4_openpose_test1_en_prompts.txt", - help="The first source prompt list path.", ) + help="The first source prompt list path.", +) parser.add_argument( "--source_prompt_list_two_path", type=str, default="prompts_artv4_openpose_test2_en_prompts.txt", - help="The second source prompt list path.", ) + help="The second source prompt list path.", +) parser.add_argument( "--source_prompt_list_three_path", type=str, default="prompts_artv4_openpose_test3_en_prompts.txt", - help="The third source prompt list path.", ) + help="The third source prompt list path.", +) parser.add_argument( "--dataset_prompt_json_name", type=str, default="prompt.json", - help="The dataset prompt json name.", ) + help="The dataset prompt json name.", +) args = 
parser.parse_args() -def get_images_form_urls(ids_list, - dir_path, - dataset_base_name, - type=None, - is_resize=False): +def get_images_form_urls(ids_list, dir_path, dataset_base_name, type=None, is_resize=False): for i, id in enumerate(tqdm(ids_list)): if dataset_base_name == "artv4_openpose_test13": if type == "原图": - img_url = (dataset_base_name_one_type_one_url_base + - f"{id}/{id}_final00_control.png") + img_url = dataset_base_name_one_type_one_url_base + f"{id}/{id}_final00_control.png" elif type == "Openpose控制图": - img_url = (dataset_base_name_one_type_two_url_base + - f"{id}/{id}_final00_control_openpose.png") + img_url = dataset_base_name_one_type_two_url_base + f"{id}/{id}_final00_control_openpose.png" if dataset_base_name == "artv4_openpose_test2": if type == "原图": - img_url = (dataset_base_name_two_type_one_url_base + - f"{id}/{id}_final00_control.png") + img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control.png" elif type == "Openpose控制图": - img_url = (dataset_base_name_two_type_one_url_base + - f"{id}/{id}_final00_control_openpose.png") + img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control_openpose.png" in_image = load_image(img_url) if is_resize: in_image = in_image.resize((512, 512)) @@ -93,9 +92,7 @@ def get_images_form_urls(ids_list, def get_prompt_json_file(ids_list, prompt_lists, dataset_base_name): - with open( - os.path.join(dataset_base_name, args.dataset_prompt_json_name), - "w") as wf: + with open(os.path.join(dataset_base_name, args.dataset_prompt_json_name), "w") as wf: for i, id in enumerate(ids_list): which_prompt_list = int(id.split("_")[1][-1]) - 1 which_prompt = int(id.split("_")[-1]) @@ -112,41 +109,16 @@ def get_prompt_json_file(ids_list, prompt_lists, dataset_base_name): if __name__ == "__main__": dataset_base_name = args.dataset_base_name - ids_list = [ - line.strip() - for line in open( - args.ids_list_path, "r", encoding="utf8").readlines() - ] + ids_list = [line.strip() for line in open(args.ids_list_path, "r", encoding="utf8").readlines()] source_prompt_lists = [ - [ - line.strip() - for line in open( - args.source_prompt_list_one_path, "r", encoding="utf8") - .readlines() - ], - [ - line.strip() - for line in open( - args.source_prompt_list_two_path, "r", encoding="utf8") - .readlines() - ], - [ - line.strip() - for line in open( - args.source_prompt_list_three_path, "r", encoding="utf8") - .readlines() - ], + [line.strip() for line in open(args.source_prompt_list_one_path, "r", encoding="utf8").readlines()], + [line.strip() for line in open(args.source_prompt_list_two_path, "r", encoding="utf8").readlines()], + [line.strip() for line in open(args.source_prompt_list_three_path, "r", encoding="utf8").readlines()], ] source_dir = os.path.join(dataset_base_name, "source") target_dir = os.path.join(dataset_base_name, "target") - get_images_form_urls( - ids_list, - source_dir, - dataset_base_name, - type="Openpose控制图", - is_resize=False) - get_images_form_urls( - ids_list, target_dir, dataset_base_name, type="原图", is_resize=False) + get_images_form_urls(ids_list, source_dir, dataset_base_name, type="Openpose控制图", is_resize=False) + get_images_form_urls(ids_list, target_dir, dataset_base_name, type="原图", is_resize=False) get_prompt_json_file(ids_list, source_prompt_lists, dataset_base_name) diff --git a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py index 79180b0f624fe..7f5bb1a23ecb4 100644 --- 
a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py +++ b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py @@ -15,10 +15,14 @@ import os import paddle -from adapter import (AdapterLDM, AdapterLDMTrainer, DataArguments, - ModelArguments, TextImagePair) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from adapter import ( + AdapterLDM, + AdapterLDMTrainer, + DataArguments, + ModelArguments, + TextImagePair, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger @@ -28,15 +32,14 @@ def unfreeze_params(params): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) + math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps + ) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -44,16 +47,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -69,12 +70,14 @@ def main(): interpolation="lanczos", tokenizer=model.tokenizer, control_image_processor=model.control_image_processor, - data_format=data_args.data_format, ) + data_format=data_args.data_format, + ) trainer = AdapterLDMTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image.py b/ppdiffusers/examples/text_to_image/train_text_to_image.py index d9e9e7295e0d5..95328abbff75f 100644 --- a/ppdiffusers/examples/text_to_image/train_text_to_image.py +++ b/ppdiffusers/examples/text_to_image/train_text_to_image.py @@ -27,8 +27,9 @@ import paddle.nn.functional as F from datasets import DatasetDict, load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms @@ -38,19 +39,27 @@ from paddlenlp.utils.log import logger from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - UNet2DConditionModel, is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import (EMAModel, freeze_params, - main_process_first, unwrap_model) +from ppdiffusers.training_utils import ( + EMAModel, + freeze_params, + main_process_first, + unwrap_model, +) from ppdiffusers.utils import PPDIFFUSERS_CACHE, check_min_version check_min_version("0.16.1") def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) class Lambda(BaseTransform): @@ -62,11 +71,11 @@ def _apply_image(self, img): return self.fn(img) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -75,8 +84,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -84,8 +94,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion 
import ( + LDMBertModel, + ) return LDMBertModel else: @@ -101,8 +112,7 @@ def fn(layer): # unet if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) model.apply(fn) @@ -122,8 +132,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training a text to image model script.") + parser = argparse.ArgumentParser(description="Simple example of a training a text to image model script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -140,7 +149,8 @@ def parse_args(input_args=None): parser.add_argument( "--train_text_encoder", action="store_true", - help="Whether to train the text encoder.", ) + help="Whether to train the text encoder.", + ) parser.add_argument( "--dataset_name", type=str, @@ -148,7 +158,8 @@ def parse_args(input_args=None): help=( "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand."), + " or to a folder containing files that 🤗 Datasets can understand." + ), ) parser.add_argument( "--dataset_config_name", @@ -164,12 +175,14 @@ def parse_args(input_args=None): "A folder containing the training data. Folder contents must follow the structure described in" " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), ) + ), + ) parser.add_argument( "--image_column", type=str, default="image", - help="The column of the dataset containing an image.", ) + help="The column of the dataset containing an image.", + ) parser.add_argument( "--caption_column", type=str, @@ -182,7 +195,9 @@ def parse_args(input_args=None): default=None, help=( "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set."), ) + "value if set." + ), + ) parser.add_argument( "--output_dir", type=str, @@ -195,32 +210,34 @@ def parse_args(input_args=None): default=None, help="The directory where the downloaded models and datasets will be stored.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -228,16 +245,19 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. 
The images will be resized to the resolution first before cropping." - ), ) + ), + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument("--num_train_epochs", type=int, default=100) parser.add_argument( "--max_train_steps", @@ -274,18 +294,22 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " - "More details here: https://arxiv.org/abs/2303.09556.", ) + "More details here: https://arxiv.org/abs/2303.09556.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -296,51 +320,49 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) - parser.add_argument( - "--use_ema", action="store_true", help="Whether to use EMA model.") - parser.add_argument( - "--debug", - action="store_true", - help="Whether to debug this training script.") + help="Power factor of the polynomial scheduler.", + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.") parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -353,27 +375,28 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) else: @@ -389,9 +412,7 @@ def parse_args(input_args=None): return args -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -401,7 +422,9 @@ def get_full_repo_name(model_id: str, return f"{organization}/{model_id}" -DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"), } +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} def main(): @@ -422,16 +445,13 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = 
Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -441,30 +461,26 @@ def main(): if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) freeze_params(vae.parameters()) if not args.train_text_encoder: @@ -472,7 +488,8 @@ def main(): if args.use_ema: ema_unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) ema_unet = EMAModel(ema_unet.parameters()) if args.gradient_checkpointing: @@ -480,14 +497,14 @@ def main(): if args.train_text_encoder: set_recompute(text_encoder, True) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) def compute_snr(timesteps): """ @@ -495,7 +512,7 @@ def compute_snr(timesteps): """ alphas_cumprod = noise_scheduler.alphas_cumprod sqrt_alphas_cumprod = alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 # Expand the tensors. 
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 @@ -504,15 +521,13 @@ def compute_snr(timesteps): sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[ - timesteps].cast("float32") + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32") while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., - None] + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) # Compute SNR. - snr = (alpha / sigma)**2 + snr = (alpha / sigma) ** 2 return snr # Get the datasets: you can either provide your own training and evaluation files (see below) @@ -523,7 +538,8 @@ def compute_snr(timesteps): if args.debug: file_path = get_path_from_url_with_filelock( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz", - PPDIFFUSERS_CACHE, ) + PPDIFFUSERS_CACHE, + ) dataset = DatasetDict.load_from_disk(file_path) args.dataset_name = "lambdalabs/pokemon-blip-captions" else: @@ -532,7 +548,8 @@ def compute_snr(timesteps): dataset = load_dataset( args.dataset_name, args.dataset_config_name, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) else: data_files = {} if args.train_data_dir is not None: @@ -540,7 +557,8 @@ def compute_snr(timesteps): dataset = load_dataset( "imagefolder", data_files=data_files, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder @@ -551,8 +569,7 @@ def compute_snr(timesteps): # 6. Get the column names for input/target. dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) if args.image_column is None: - image_column = (dataset_columns[0] - if dataset_columns is not None else column_names[0]) + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: image_column = args.image_column if image_column not in column_names: @@ -560,8 +577,7 @@ def compute_snr(timesteps): f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" ) if args.caption_column is None: - caption_column = (dataset_columns[1] - if dataset_columns is not None else column_names[1]) + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: caption_column = args.caption_column if caption_column not in column_names: @@ -578,8 +594,7 @@ def tokenize_captions(examples, is_train=True): captions.append(caption) elif isinstance(caption, (list, np.ndarray)): # take a random caption if there are multiple - captions.append( - random.choice(caption) if is_train else caption[0]) + captions.append(random.choice(caption) if is_train else caption[0]) else: raise ValueError( f"Caption column `{caption_column}` should contain either strings or lists of strings." @@ -589,20 +604,22 @@ def tokenize_captions(examples, is_train=True): max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True, - return_attention_mask=False, ) + return_attention_mask=False, + ) return inputs.input_ids # Preprocessing the datasets. 
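Editor's note on compute_snr above: it derives the per-timestep signal-to-noise ratio from the scheduler's alphas_cumprod as SNR(t) = alpha_bar_t / (1 - alpha_bar_t). A standalone numeric sketch (the alphas_cumprod values below are made up for illustration, not taken from any real scheduler):

import numpy as np

# Toy alphas_cumprod for 4 timesteps; real values come from the DDPM scheduler.
alphas_cumprod = np.array([0.999, 0.98, 0.7, 0.1])
timesteps = np.array([1, 3])

alpha = np.sqrt(alphas_cumprod[timesteps])        # sqrt(alpha_bar_t)
sigma = np.sqrt(1.0 - alphas_cumprod[timesteps])  # sqrt(1 - alpha_bar_t)
snr = (alpha / sigma) ** 2                        # alpha_bar_t / (1 - alpha_bar_t)
print(snr)  # large for early (low-noise) timesteps, small for late (noisy) ones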
- train_transforms = transforms.Compose([ - transforms.Resize( - (args.height, args.width), interpolation="bilinear"), - transforms.CenterCrop((args.height, args.width)) if args.center_crop - else transforms.RandomCrop((args.height, args.width)), - transforms.RandomHorizontalFlip() - if args.random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + train_transforms = transforms.Compose( + [ + transforms.Resize((args.height, args.width), interpolation="bilinear"), + transforms.CenterCrop((args.height, args.width)) + if args.center_crop + else transforms.RandomCrop((args.height, args.width)), + transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def preprocess_train(examples): images = [image.convert("RGB") for image in examples[image_column]] @@ -612,47 +629,42 @@ def preprocess_train(examples): with main_process_first(): if args.max_train_samples is not None: - dataset["train"] = (dataset["train"].shuffle(seed=args.seed) - .select(range(args.max_train_samples))) + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): - pixel_values = paddle.stack( - [example["pixel_values"] for example in examples]).cast("float32") + pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32") input_ids = [example["input_ids"] for example in examples] input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. 
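Editor's note on the padding strategy used in this hunk: tokenize_captions tokenizes with padding="do_not_pad", and collate_fn only pads the batch up to the tokenizer's model_max_length when the batch is assembled. A tiny sketch of that deferred-padding pattern with hypothetical ids (the token values are illustrative, not real CLIP vocabulary guarantees):

# Variable-length ids as tokenize_captions would return them (no padding yet).
batch_input_ids = [[101, 7, 9, 102], [101, 7, 102]]
model_max_length = 8
pad_token_id = 0  # placeholder pad id for illustration

# What tokenizer.pad(..., padding="max_length") conceptually does at collate time.
padded = [ids + [pad_token_id] * (model_max_length - len(ids)) for ids in batch_input_ids]
print(padded)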
- num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -660,23 +672,22 @@ def collate_fn(examples): text_encoder = paddle.DataParallel(text_encoder) params_to_optimize = ( - list(unet.parameters()) + list(text_encoder.parameters()) - if args.train_text_encoder else unet.parameters()) + list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) # Initialize the optimizer optimizer = AdamW( learning_rate=lr_scheduler, @@ -685,8 +696,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if is_main_process: logger.info("----------- Configuration Arguments -----------") @@ -696,25 +707,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
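Editor's note, a worked example of the step bookkeeping above (all numbers invented for illustration): with 1,000 batches per epoch, gradient_accumulation_steps=4, 2 processes, and a per-device batch size of 16, each epoch produces ceil(1000/4) = 250 optimizer updates and the effective batch size is 16 * 2 * 4 = 128.

import math

# Illustrative values only.
len_train_dataloader = 1000
gradient_accumulation_steps = 4
num_train_epochs = 100
train_batch_size = 16
num_processes = 2

num_update_steps_per_epoch = math.ceil(len_train_dataloader / gradient_accumulation_steps)  # 250
max_train_steps = num_train_epochs * num_update_steps_per_epoch                              # 25000
total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps            # 128
print(num_update_steps_per_epoch, max_train_steps, total_batch_size)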
- progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 @@ -737,20 +742,19 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where @@ -758,68 +762,61 @@ def collate_fn(examples): if args.train_text_encoder: text_encoder_ctx_manager = text_encoder.no_sync() else: - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) - else contextlib.suppress()) + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) with text_encoder_ctx_manager: # Get the text embedding for conditioning if use_attention_mask: - attention_mask = (batch["input_ids"] != - tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") if args.snr_gamma is None: loss = F.mse_loss( model_pred.cast("float32"), 
target.cast("float32"), - reduction="mean", ) + reduction="mean", + ) else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. snr = compute_snr(timesteps) - mse_loss_weights = (paddle.stack( - [ - snr, - args.snr_gamma * paddle.ones_like(timesteps) - ], - axis=1, ).min(1)[0] / snr) + mse_loss_weights = ( + paddle.stack([snr, args.snr_gamma * paddle.ones_like(timesteps)], axis=1,).min( + 1 + )[0] + / snr + ) # We first calculate the original loss. Then we mean over the non-batch dimensions and # rebalance the sample-wise losses with their respective loss weights. # Finally, we take the mean of the rebalanced loss. loss = F.mse_loss( model_pred.cast("float32"), target.cast("float32"), - reduction="none", ) - loss = (loss.mean(axis=list(range(1, len(loss.shape)))) - * mse_loss_weights) + reduction="none", + ) + loss = loss.mean(axis=list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() if args.gradient_accumulation_steps > 1: @@ -851,13 +848,10 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") - unwrap_model(unet).save_pretrained( - os.path.join(save_path, "unet")) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet")) if args.train_text_encoder: - unwrap_model(text_encoder).save_pretrained( - os.path.join(save_path, "text_encoder")) + unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder")) if global_step >= args.max_train_steps: break @@ -871,14 +865,12 @@ def collate_fn(examples): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unet, - text_encoder=unwrap_model(text_encoder), ) + text_encoder=unwrap_model(text_encoder), + ) pipeline.save_pretrained(args.output_dir) if args.push_to_hub: - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) if __name__ == "__main__": diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py index b07bc09c1d1ae..611aebd6a5dc0 100644 --- a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py +++ b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py @@ -29,8 +29,9 @@ import paddle.nn.functional as F from datasets import DatasetDict, load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms @@ -40,31 +41,37 @@ from paddlenlp.utils.log import logger from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + 
is_ppxformers_available, +) from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin from ppdiffusers.models.attention_processor import ( - AttnProcessor, AttnProcessor2_5, LoRAAttnProcessor, LoRAAttnProcessor2_5) + AttnProcessor, + AttnProcessor2_5, + LoRAAttnProcessor, + LoRAAttnProcessor2_5, +) from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import (freeze_params, main_process_first, - unwrap_model) -from ppdiffusers.utils import (PPDIFFUSERS_CACHE, TEXT_ENCODER_ATTN_MODULE, - check_min_version) +from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model +from ppdiffusers.utils import ( + PPDIFFUSERS_CACHE, + TEXT_ENCODER_ATTN_MODULE, + check_min_version, +) check_min_version("0.16.1") def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) -def save_model_card(repo_id: str, - images=None, - base_model=str, - dataset_name=str, - repo_folder=None): +def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -94,11 +101,11 @@ def save_model_card(repo_id: str, f.write(yaml + model_card) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -107,8 +114,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -116,8 +124,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -148,8 +157,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training text to image lora script.") + parser = argparse.ArgumentParser(description="Simple example of a training text to image lora script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -170,7 +178,8 @@ def parse_args(input_args=None): help=( "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand."), + " or to a folder containing files that 🤗 Datasets can understand." 
+ ), ) parser.add_argument( "--dataset_config_name", @@ -186,12 +195,14 @@ def parse_args(input_args=None): "A folder containing the training data. Folder contents must follow the structure described in" " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), ) + ), + ) parser.add_argument( "--image_column", type=str, default="image", - help="The column of the dataset containing an image.", ) + help="The column of the dataset containing an image.", + ) parser.add_argument( "--caption_column", type=str, @@ -202,7 +213,8 @@ def parse_args(input_args=None): "--validation_prompt", type=str, default=None, - help="A prompt that is sampled during training for inference.", ) + help="A prompt that is sampled during training for inference.", + ) parser.add_argument( "--num_validation_images", type=int, @@ -216,14 +228,17 @@ def parse_args(input_args=None): help=( "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), ) + ), + ) parser.add_argument( "--max_train_samples", type=int, default=None, help=( "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set."), ) + "value if set." + ), + ) parser.add_argument( "--output_dir", type=str, @@ -236,32 +251,34 @@ def parse_args(input_args=None): default=None, help="The directory where the downloaded models and datasets will be stored.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -269,21 +286,25 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." 
- ), ) + ), + ) parser.add_argument( "--lora_rank", type=int, default=4, - help="The rank of lora linear.", ) + help="The rank of lora linear.", + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -300,7 +321,8 @@ def parse_args(input_args=None): "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--gradient_accumulation_steps", type=int, @@ -330,12 +352,15 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -346,49 +371,48 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) - parser.add_argument( - "--debug", - action="store_true", - help="Whether to debug this training script.") + help="Power factor of the polynomial scheduler.", + ) + parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.") parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -401,22 +425,22 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) else: @@ -432,9 +456,7 @@ def parse_args(input_args=None): return args -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -444,7 +466,9 @@ def get_full_repo_name(model_id: str, return f"{organization}/{model_id}" -DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"), } +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} def main(): @@ -465,16 +489,13 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") 
as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -484,44 +505,40 @@ def main(): if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) # We only train the additional adapter LoRA layers freeze_params(vae.parameters()) freeze_params(text_encoder.parameters()) freeze_params(unet.parameters()) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warning( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # now we will add new LoRA weights to the attention layers # It's important to realize here how many attention weights will be added and of which sizes # The sizes of the attention layers consist only of two different variables: @@ -538,14 +555,12 @@ def main(): # Set correct lora layers unet_lora_attn_procs = {} for name, attn_processor in unet.attn_processors.items(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - unet.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] @@ -555,14 +570,13 @@ def main(): elif isinstance(attn_processor, AttnProcessor2_5): lora_attn_processor_class = LoRAAttnProcessor2_5 else: - raise ValueError( - f"Unknown attention processor type: {attn_processor.__class__.__name__}" - ) + raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}") unet_lora_attn_procs[name] = lora_attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) unet.set_attn_processor(unet_lora_attn_procs) unet_lora_layers = AttnProcsLayers(unet.attn_processors) @@ -578,10 +592,12 @@ def main(): text_lora_attn_procs[name] = LoRAAttnProcessor( hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) temp_pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, text_encoder=text_encoder) + args.pretrained_model_name_or_path, text_encoder=text_encoder + ) temp_pipeline._modify_text_encoder(text_lora_attn_procs) text_encoder = temp_pipeline.text_encoder del temp_pipeline @@ -594,7 +610,8 @@ def main(): if args.debug: file_path = get_path_from_url_with_filelock( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz", - PPDIFFUSERS_CACHE, ) + PPDIFFUSERS_CACHE, + ) dataset = DatasetDict.load_from_disk(file_path) args.dataset_name = "lambdalabs/pokemon-blip-captions" else: @@ -603,7 +620,8 @@ def main(): dataset = load_dataset( args.dataset_name, args.dataset_config_name, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) else: data_files = {} if args.train_data_dir is not None: @@ -611,7 +629,8 @@ def main(): dataset = load_dataset( "imagefolder", data_files=data_files, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder @@ -622,8 +641,7 @@ def main(): # 6. Get the column names for input/target. 
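# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the LoRA
# processors configured above add a trainable low-rank update to each frozen
# attention projection, y = W x + (alpha / r) * B(A(x)), which is why only
# `hidden_size`, `cross_attention_dim` and `rank` are needed to size them.
# The class `ToyLoRALinear` below is a hypothetical illustration of that idea,
# not a ppdiffusers API.
import paddle
import paddle.nn as nn

class ToyLoRALinear(nn.Layer):
    def __init__(self, in_features, out_features, rank=4, alpha=4.0):
        super().__init__()
        self.base = nn.Linear(in_features, out_features)  # stands in for the frozen pretrained projection
        self.base.weight.stop_gradient = True
        self.base.bias.stop_gradient = True
        self.lora_down = nn.Linear(in_features, rank, bias_attr=False)  # A: d -> r
        self.lora_up = nn.Linear(rank, out_features, bias_attr=False)   # B: r -> d
        self.scale = alpha / rank

    def forward(self, x):
        # frozen base output plus the scaled low-rank correction
        return self.base(x) + self.scale * self.lora_up(self.lora_down(x))

# With rank 4, a 320x320 projection trains 2 * 320 * 4 = 2,560 extra parameters
# instead of updating all 102,400, which is the saving LoRA fine-tuning relies on.
x = paddle.randn([2, 77, 320])
print(ToyLoRALinear(320, 320, rank=4)(x).shape)  # [2, 77, 320]
# ---------------------------------------------------------------------------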
dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) if args.image_column is None: - image_column = (dataset_columns[0] - if dataset_columns is not None else column_names[0]) + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: image_column = args.image_column if image_column not in column_names: @@ -631,8 +649,7 @@ def main(): f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" ) if args.caption_column is None: - caption_column = (dataset_columns[1] - if dataset_columns is not None else column_names[1]) + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: caption_column = args.caption_column if caption_column not in column_names: @@ -649,8 +666,7 @@ def tokenize_captions(examples, is_train=True): captions.append(caption) elif isinstance(caption, (list, np.ndarray)): # take a random caption if there are multiple - captions.append( - random.choice(caption) if is_train else caption[0]) + captions.append(random.choice(caption) if is_train else caption[0]) else: raise ValueError( f"Caption column `{caption_column}` should contain either strings or lists of strings." @@ -660,20 +676,22 @@ def tokenize_captions(examples, is_train=True): max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True, - return_attention_mask=False, ) + return_attention_mask=False, + ) return inputs.input_ids # Preprocessing the datasets. - train_transforms = transforms.Compose([ - transforms.Resize( - (args.height, args.width), interpolation="bilinear"), - transforms.CenterCrop((args.height, args.width)) if args.center_crop - else transforms.RandomCrop((args.height, args.width)), - transforms.RandomHorizontalFlip() - if args.random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + train_transforms = transforms.Compose( + [ + transforms.Resize((args.height, args.width), interpolation="bilinear"), + transforms.CenterCrop((args.height, args.width)) + if args.center_crop + else transforms.RandomCrop((args.height, args.width)), + transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def preprocess_train(examples): images = [image.convert("RGB") for image in examples[image_column]] @@ -683,67 +701,62 @@ def preprocess_train(examples): with main_process_first(): if args.max_train_samples is not None: - dataset["train"] = (dataset["train"].shuffle(seed=args.seed) - .select(range(args.max_train_samples))) + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): - pixel_values = paddle.stack( - [example["pixel_values"] for example in examples]).cast("float32") + pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32") input_ids = [example["input_ids"] for example in examples] input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - 
batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) - params_to_optimize = (list(unet_lora_layers.parameters()) + - list(text_encoder_lora_layers.parameters()) - if args.train_text_encoder else - unet_lora_layers.parameters()) + params_to_optimize = ( + list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) # Optimizer creation optimizer = AdamW( learning_rate=lr_scheduler, @@ -752,8 +765,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -768,25 +781,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. - progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 vae.eval() @@ -807,52 +814,43 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org/blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where unet_ctx_manager = unet.no_sync() else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() if use_attention_mask: - attention_mask = ( - batch["input_ids"] != tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") loss = F.mse_loss(model_pred, target, reduction="mean") @@ -883,52 +881,51 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") # We combine the text encoder and UNet LoRA parameters with a simple # custom logic. 
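# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the two loss
# targets chosen above follow the usual DDPM parameterisation. Given the
# forward process x_t = sqrt(a_t) * x_0 + sqrt(1 - a_t) * eps, the target is
# eps for "epsilon" and v = sqrt(a_t) * eps - sqrt(1 - a_t) * x_0 for
# "v_prediction" (the same formula as `get_velocity` in ldm/model.py later in
# this patch). A tiny numerical check of the v-prediction identity:
import paddle

a_t = paddle.to_tensor(0.7)                       # cumulative alpha product at some timestep
x0, eps = paddle.randn([4]), paddle.randn([4])
x_t = a_t.sqrt() * x0 + (1 - a_t).sqrt() * eps    # noisy sample
v = a_t.sqrt() * eps - (1 - a_t).sqrt() * x0      # v-prediction target
print(paddle.allclose(a_t.sqrt() * x_t - (1 - a_t).sqrt() * v, x0))  # True: x_0 is recoverable from v
# ---------------------------------------------------------------------------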
So, use `LoraLoaderMixin.save_lora_weights()`. LoraLoaderMixin.save_lora_weights( save_directory=save_path, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) logger.info(f"Saved lora weights to {save_path}") if global_step >= args.max_train_steps: break if is_main_process: - if (args.validation_prompt is not None and - epoch % args.validation_epochs == 0): + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}.") + f" {args.validation_prompt}." + ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unwrap_model(unet), text_encoder=unwrap_model(text_encoder), safety_checker=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipeline.set_progress_bar_config(disable=True) # run inference - generator = (paddle.Generator().manual_seed(args.seed) - if args.seed else None) + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ pipeline( args.validation_prompt, num_inference_steps=30, - generator=generator, ).images[0] + generator=generator, + ).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) if args.report_to == "tensorboard": - writer.add_images( - "validation", np_images, epoch, dataformats="NHWC") + writer.add_images("validation", np_images, epoch, dataformats="NHWC") else: - writer.add_image( - "validation", np_images, epoch, dataformats="NHWC") + writer.add_image("validation", np_images, epoch, dataformats="NHWC") del pipeline gc.collect() @@ -941,7 +938,8 @@ def collate_fn(examples): LoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) if args.push_to_hub: save_model_card( @@ -949,31 +947,25 @@ def collate_fn(examples): images=images, base_model=args.pretrained_model_name_or_path, prompt=args.instance_prompt, - repo_folder=args.output_dir, ) - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo_folder=args.output_dir, + ) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) # Final inference # Load previous pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) # load attention processors pipeline.load_lora_weights(args.output_dir) # run inference if args.validation_prompt and args.num_validation_images > 0: - generator = paddle.Generator().manual_seed( - args.seed) if args.seed else None + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ - pipeline( - args.validation_prompt, - num_inference_steps=30, - generator=generator).images[0] + pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) diff --git 
a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py index 9f7e732a9033e..c8527964620b4 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py +++ b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py @@ -20,9 +20,13 @@ import pandas as pd from tqdm.auto import tqdm -from ppdiffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler, - LDMTextToImagePipeline, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers import ( + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, +) def batchify(data, batch_size=16): @@ -37,18 +41,19 @@ def batchify(data, batch_size=16): def generate_images( - model_name_or_path, - batch_size=16, - file="coco30k.csv", - save_path="output", - seed=42, - scheduler_type="ddim", - eta=0.0, - num_inference_steps=50, - guidance_scales=[3, 4, 5, 6, 7, 8], - height=256, - width=256, - device="gpu", ): + model_name_or_path, + batch_size=16, + file="coco30k.csv", + save_path="output", + seed=42, + scheduler_type="ddim", + eta=0.0, + num_inference_steps=50, + guidance_scales=[3, 4, 5, 6, 7, 8], + height=256, + width=256, + device="gpu", +): paddle.set_device(device) pipe = LDMTextToImagePipeline.from_pretrained(model_name_or_path) pipe.set_progress_bar_config(disable=True) @@ -62,17 +67,14 @@ def generate_images( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -81,7 +83,8 @@ def generate_images( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") pipe.scheduler = scheduler @@ -103,7 +106,8 @@ def generate_images( eta=eta, height=height, width=width, - num_inference_steps=num_inference_steps, )[0] + num_inference_steps=num_inference_steps, + )[0] for image in images: path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) image.save(path) @@ -117,17 +121,20 @@ def generate_images( default=None, type=str, required=True, - help="model_name_or_path.", ) + help="model_name_or_path.", + ) parser.add_argument( "--file", default="./coco30k.tsv", type=str, - help="eval file.", ) + help="eval file.", + ) parser.add_argument( "--seed", default=42, type=int, - help="random seed.", ) + help="random seed.", + ) parser.add_argument( "--scheduler_type", default="ddim", @@ -137,22 +144,20 @@ def generate_images( ) parser.add_argument("--device", default="gpu", type=str, help="device") parser.add_argument("--batch_size", default=16, type=int, help="batch_size") - parser.add_argument( - "--num_inference_steps", - default=50, - type=int, - help="num_inference_steps") + parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps") parser.add_argument( 
"--save_path", default="output/1.5b_ldm/12w.pd", type=str, - help="Path to the output file.", ) + help="Path to the output file.", + ) parser.add_argument( "--guidance_scales", default=[3, 4, 5, 6, 7, 8], nargs="+", type=str, - help="guidance_scales list.", ) + help="guidance_scales list.", + ) parser.add_argument("--height", default=256, type=int, help="height.") parser.add_argument("--width", default=256, type=int, help="width.") args = parser.parse_args() @@ -171,4 +176,5 @@ def generate_images( scheduler_type=args.scheduler_type, height=args.height, width=args.width, - device=args.device, ) + device=args.device, + ) diff --git a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py index c89f6fd190bf7..069fde479ce3d 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py +++ b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py @@ -19,8 +19,13 @@ from paddlenlp.transformers import AutoTokenizer from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LDMBertModel, - LDMTextToImagePipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LDMBertModel, + LDMTextToImagePipeline, + UNet2DConditionModel, +) from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig @@ -30,27 +35,32 @@ def parse_args(): "--model_file", type=str, default="./model_state.pdparams", - help="path to pretrained model_state.pdparams", ) + help="path to pretrained model_state.pdparams", + ) parser.add_argument( "--output_path", type=str, default="./ldm_pipelines", - help="the output path of pipeline.", ) + help="the output path of pipeline.", + ) parser.add_argument( "--vae_name_or_path", type=str, default="CompVis/stable-diffusion-v1-4/vae", - help="pretrained_vae_name_or_path.", ) + help="pretrained_vae_name_or_path.", + ) parser.add_argument( "--text_encoder_config_file", type=str, default="./config/ldmbert.json", - help="text_encoder_config_file.", ) + help="text_encoder_config_file.", + ) parser.add_argument( "--unet_config_file", type=str, default="./config/unet.json", - help="unet_config_file.", ) + help="unet_config_file.", + ) parser.add_argument( "--tokenizer_name_or_path", type=str, @@ -61,12 +71,9 @@ def parse_args(): "--model_max_length", type=int, default=77, - help="Pretrained tokenizer model_max_length.", ) - parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use. Like gpu:0 or cpu") + help="Pretrained tokenizer model_max_length.", + ) + parser.add_argument("--device", type=str, default=None, help="Device to use. 
Like gpu:0 or cpu") return parser.parse_args() @@ -119,17 +126,17 @@ def check_keys(model, state_dict): def build_pipelines( - model_file, - output_path, - vae_name_or_path, - unet_config_file, - text_encoder_config_file, - tokenizer_name_or_path="bert-base-uncased", - model_max_length=77, ): + model_file, + output_path, + vae_name_or_path, + unet_config_file, + text_encoder_config_file, + tokenizer_name_or_path="bert-base-uncased", + model_max_length=77, +): vae = AutoencoderKL.from_config(vae_name_or_path) unet = UNet2DConditionModel(**read_json(unet_config_file)) - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, model_max_length=model_max_length) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=model_max_length) text_encoder_config = read_json(text_encoder_config_file) vocab_size = text_encoder_config["vocab_size"] max_position_embeddings = text_encoder_config["max_position_embeddings"] @@ -143,8 +150,7 @@ def build_pipelines( logger.info( f"The tokenizer's model_max_length {tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {tokenizer.model_max_length} as max_position_embeddings!" ) - text_encoder_config[ - "max_position_embeddings"] = tokenizer.model_max_length + text_encoder_config["max_position_embeddings"] = tokenizer.model_max_length cofnig = LDMBertConfig(**text_encoder_config) text_encoder = LDMBertModel(cofnig) scheduler = DDIMScheduler( @@ -154,7 +160,8 @@ def build_pipelines( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) unet_dict, vae_dict, text_encoder_dict = extract_paramaters(model_file) check_keys(unet, unet_dict) check_keys(vae, vae_dict) @@ -167,7 +174,8 @@ def build_pipelines( tokenizer=tokenizer, scheduler=scheduler, vqvae=vae, - unet=unet, ) + unet=unet, + ) pipe.save_pretrained(output_path) @@ -182,4 +190,5 @@ def build_pipelines( unet_config_file=args.unet_config_file, text_encoder_config_file=args.text_encoder_config_file, tokenizer_name_or_path=args.tokenizer_name_or_path, - model_max_length=args.model_max_length, ) + model_max_length=args.model_max_length, + ) diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py index f7c2e091bed03..0443a7224578e 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py @@ -29,51 +29,43 @@ class ModelArguments: # use pretrained vae kl-8.ckpt (CompVis/stable-diffusion-v1-4/vae) vae_name_or_path: Optional[str] = field( default="CompVis/stable-diffusion-v1-4/vae", - metadata={"help": "pretrained_vae_name_or_path"}, ) + metadata={"help": "pretrained_vae_name_or_path"}, + ) text_encoder_config_file: Optional[str] = field( - default="./config/ldmbert.json", - metadata={"help": "text_encoder_config_file"}) - unet_config_file: Optional[str] = field( - default="./config/unet.json", metadata={"help": "unet_config_file"}) + default="./config/ldmbert.json", metadata={"help": "text_encoder_config_file"} + ) + unet_config_file: Optional[str] = field(default="./config/unet.json", metadata={"help": "unet_config_file"}) tokenizer_name: Optional[str] = field( default="bert-base-uncased", - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }, ) - model_max_length: Optional[int] = field( - default=77, metadata={"help": "Pretrained 
tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field( - default=200, metadata={"help": "num_inference_steps"}) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, + ) + model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + num_inference_steps: Optional[int] = field(default=200, metadata={"help": "num_inference_steps"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) pretrained_model_name_or_path: str = field( default=None, - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) - to_static: bool = field( - default=False, metadata={"help": "Whether or not to_static"}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) + to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"}) prediction_type: Optional[str] = field( default="epsilon", metadata={ - "help": - "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, ) + "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" + }, + ) benchmark: bool = field( default=False, - metadata={"help": "Whether or not run benchmark."}, ) + metadata={"help": "Whether or not run benchmark."}, + ) profiler_options: Optional[str] = field( default=None, - metadata={"help": "profiler_options."}, ) - noise_offset: Optional[int] = field( - default=0, metadata={"help": "The scale of noise offset."}) + metadata={"help": "profiler_options."}, + ) + noise_offset: Optional[int] = field(default=0, metadata={"help": "The scale of noise offset."}) @dataclass @@ -84,113 +76,89 @@ class DataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) resolution: int = field( default=256, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) @dataclass class NoTrainerTrainingArguments: output_dir: str = field( default="outputs", - metadata={ - "help": - "The output directory where the model predictions and checkpoints will be written." - }, ) + metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, + ) per_device_train_batch_size: int = field( - default=16, - metadata={"help": "Batch size per GPU core/CPU for training."}) + default=16, metadata={"help": "Batch size per GPU core/CPU for training."} + ) gradient_accumulation_steps: int = field( default=2, - metadata={ - "help": - "Number of updates steps to accumulate before performing a backward/update pass." - }, ) - learning_rate: float = field( - default=5e-5, - metadata={"help": "The initial learning rate for AdamW."}) - weight_decay: float = field( - default=0.02, - metadata={"help": "Weight decay for AdamW if we apply some."}) - adam_beta1: float = field( - default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) - adam_beta2: float = field( - default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) - adam_epsilon: float = field( - default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) - max_grad_norm: float = field( - default=-1.0, metadata={"help": "Max gradient norm."}) - num_train_epochs: int = field( - default=100, - metadata={"help": "Total number of training epochs to perform."}) + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) + weight_decay: float = field(default=0.02, metadata={"help": "Weight decay for AdamW if we apply some."}) + adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) + adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) + adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) + max_grad_norm: float = field(default=-1.0, metadata={"help": "Max gradient norm."}) + num_train_epochs: int = field(default=100, metadata={"help": "Total number of training epochs to perform."}) max_steps: int = field( default=1000000000, - metadata={ - "help": - "If > 0: set total number of training steps to perform. Override num_train_epochs." - }, ) + metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, + ) lr_scheduler_type: str = field( default="constant", metadata={ - "help": - 'The scheduler type to use. support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]' - }, ) - warmup_steps: int = field( - default=0, metadata={"help": "Linear warmup over warmup_steps."}) + "help": 'The scheduler type to use. 
support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]' + }, + ) + warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) - logging_dir: Optional[str] = field( - default="logs", metadata={"help": "VisualDL log dir."}) + logging_dir: Optional[str] = field(default="logs", metadata={"help": "VisualDL log dir."}) - logging_steps: int = field( - default=50, metadata={"help": "Log every X updates steps."}) + logging_steps: int = field(default=50, metadata={"help": "Log every X updates steps."}) - save_steps: int = field( - default=5000, - metadata={"help": "Save checkpoint every X updates steps."}) + save_steps: int = field(default=5000, metadata={"help": "Save checkpoint every X updates steps."}) seed: int = field( default=23, - metadata={ - "help": "Random seed that will be set at the beginning of training." - }, ) + metadata={"help": "Random seed that will be set at the beginning of training."}, + ) dataloader_num_workers: int = field( default=6, metadata={ - "help": - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - }, ) + "help": "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + }, + ) report_to: str = field( default="visualdl", - metadata={ - "help": - "The list of integrations to report the results and logs to." - }, ) + metadata={"help": "The list of integrations to report the results and logs to."}, + ) recompute: bool = field( default=False, metadata={ - "help": - "Recompute the forward pass to calculate gradients. Used for saving memory. " + "help": "Recompute the forward pass to calculate gradients. Used for saving memory. " "Only support for networks with transformer blocks." 
- }, ) + }, + ) def __str__(self): self_as_dict = asdict(self) - self_as_dict = { - k: f"<{k.upper()}>" if k.endswith("_token") else v - for k, v in self_as_dict.items() - } + self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()} attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" @@ -207,8 +175,7 @@ def print_config(self, args=None, key=""): key = "Training" logger.info("{:^40}".format("{} Configuration Arguments".format(key))) - logger.info("{:30}:{}".format("paddle commit id", - paddle.version.commit)) + logger.info("{:30}:{}".format("paddle commit id", paddle.version.commit)) for a in dir(args): if a[:2] != "__": # don't print double underscore methods diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py index 9103c0221f18a..6a99ea7a8f8bc 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py @@ -20,7 +20,11 @@ from paddle.io import DataLoader from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, TrainerCallback, VisualDLCallback, rewrite_logs) + INTEGRATION_TO_CALLBACK, + TrainerCallback, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.utils import profiler from paddlenlp.utils.log import logger @@ -38,19 +42,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -58,22 +60,26 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"]) + image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) image_logs["ddim-samples-1.0"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=1.0, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) image_logs["ddim-samples-7.5"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=7.5, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) if not state.is_world_process_zero: return @@ -91,11 +97,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" 
of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -136,8 +142,7 @@ def __init__(self, benchmark=True, profiler_options=None): self.profiler_options = profiler_options def on_train_begin(self, args, state, control, **kwargs): - assert (args.gradient_accumulation_steps == 1 and not args.do_eval and - not args.do_predict) + assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict if self.benchmark: self.reader_cost_avg = AverageStatistical() @@ -162,8 +167,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): if self.benchmark: if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + ( - time.time() - self.maybe_log_save_evaluate_start) + self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) ips = logs["interval_steps_per_second"] * args.train_batch_size avg_batch_cost = 1 / logs["interval_steps_per_second"] logger.info( @@ -175,14 +179,15 @@ def on_log(self, args, state, control, logs=None, **kwargs): self.reader_cost_avg.get_average(), avg_batch_cost, args.train_batch_size, - ips, )) + ips, + ) + ) self.reader_cost_avg.reset() def on_epoch_end(self, args, state, control, **kwargs): if self.benchmark: train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % - (state.epoch, train_epoch_cost)) + logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) # register visualdl_with_image @@ -196,7 +201,9 @@ def __init__(self, **kwargs): self.add_callback( BenchmarkCallback( benchmark=self.args.benchmark, - profiler_options=self.args.profiler_options, )) + profiler_options=self.args.profiler_options, + ) + ) if self.args.benchmark: if self.args.disable_tqdm: self.pop_callback(PrinterCallback) @@ -215,6 +222,7 @@ def get_train_dataloader(self): self.train_dataset, batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) else: return super().get_train_dataloader() diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py index 2fe8ba07c5621..5b4bb009920c4 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py @@ -20,9 +20,14 @@ import paddle.nn.functional as F from paddlenlp.transformers import AutoTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LDMBertModel, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LDMBertModel, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.models.attention import AttentionBlock from ppdiffusers.models.ema import LitEma from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig @@ -31,15 +36,15 @@ try: from ppdiffusers.models.attention import SpatialTransformer except ImportError: - from ppdiffusers.models.transformer_2d import (Transformer2DModel as - SpatialTransformer, ) 
+ from ppdiffusers.models.transformer_2d import ( + Transformer2DModel as SpatialTransformer, + ) import json from paddlenlp.utils.log import logger -from ppdiffusers.initializer import (normal_, reset_initialized_parameter, - zeros_) +from ppdiffusers.initializer import normal_, reset_initialized_parameter, zeros_ from ppdiffusers.models.resnet import ResnetBlock2D @@ -55,31 +60,31 @@ def __init__(self, model_args): # init tokenizer tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - model_max_length=model_args.model_max_length) + tokenizer_name_or_path, model_max_length=model_args.model_max_length + ) # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "vqvae")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "vqvae") + ) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) freeze_params(self.vae.parameters()) logger.info("Freeze vae parameters!") if model_args.pretrained_model_name_or_path is None: assert ( - model_args.text_encoder_config_file is not None and - model_args.unet_config_file is not None + model_args.text_encoder_config_file is not None and model_args.unet_config_file is not None ), "we must supply text_encoder_config_file & unet_config_file" # init text_encoder text_encoder_config = read_json(model_args.text_encoder_config_file) vocab_size = text_encoder_config["vocab_size"] - max_position_embeddings = text_encoder_config[ - "max_position_embeddings"] + max_position_embeddings = text_encoder_config["max_position_embeddings"] if self.tokenizer.vocab_size != vocab_size: logger.info( f"The tokenizer has a vocab size of {self.tokenizer.vocab_size}, while the text encoder has a vocab size of {vocab_size}, we will use {self.tokenizer.vocab_size} as vocab_size!" @@ -90,24 +95,24 @@ def __init__(self, model_args): logger.info( f"The tokenizer's model_max_length {self.tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {self.tokenizer.model_max_length} as max_position_embeddings!" 
) - text_encoder_config[ - "max_position_embeddings"] = self.tokenizer.model_max_length + text_encoder_config["max_position_embeddings"] = self.tokenizer.model_max_length config = LDMBertConfig(**text_encoder_config) self.text_encoder = LDMBertModel(config) self.text_encoder_is_pretrained = False # init unet2d - self.unet = UNet2DConditionModel( - **read_json(model_args.unet_config_file)) + self.unet = UNet2DConditionModel(**read_json(model_args.unet_config_file)) self.unet_is_pretrained = False else: # init text_encoder self.text_encoder = LDMBertModel.from_pretrained( - model_args.pretrained_model_name_or_path, subfolder="bert") + model_args.pretrained_model_name_or_path, subfolder="bert" + ) self.text_encoder_is_pretrained = True # init unet2d self.unet = UNet2DConditionModel.from_pretrained( - model_args.pretrained_model_name_or_path, subfolder="unet") + model_args.pretrained_model_name_or_path, subfolder="unet" + ) self.unet_is_pretrained = True assert model_args.prediction_type in ["epsilon", "v_prediction"] @@ -117,9 +122,9 @@ def __init__(self, model_args): beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, - prediction_type=self.prediction_type, ) - self.register_buffer("alphas_cumprod", - self.noise_scheduler.alphas_cumprod) + prediction_type=self.prediction_type, + ) + self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) if model_args.image_logging_steps > 0: self.eval_scheduler = DDIMScheduler( @@ -130,7 +135,8 @@ def __init__(self, model_args): clip_sample=False, set_alpha_to_one=False, steps_offset=1, - prediction_type=self.prediction_type, ) + prediction_type=self.prediction_type, + ) self.eval_scheduler.set_timesteps(model_args.num_inference_steps) self.init_weights() self.use_ema = model_args.use_ema @@ -138,14 +144,14 @@ def __init__(self, model_args): if self.use_ema: self.model_ema = LitEma(self.unet) - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # make sure unet text_encoder in train mode, vae in eval mode self.unet.train() @@ -153,35 +159,31 @@ def __init__(self, model_args): self.vae.eval() def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -193,10 +195,8 @@ def init_weights(self): # init text_encoder if not self.text_encoder_is_pretrained: reset_initialized_parameter(self.text_encoder) - normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, - 0.02) - normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, - 0.02) + normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02) + normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02) # init unet if not self.unet_is_pretrained: reset_initialized_parameter(self.unet) @@ -243,16 +243,15 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): if self.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += self.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=noise.dtype) - timesteps = paddle.randint(0, - self.noise_scheduler.num_train_timesteps, - (latents.shape[0], )).astype("int64") + (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype + ) + timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype( + "int64" + ) noisy_latents = self.add_noise(latents, noise, timesteps) encoder_hidden_states = self.text_encoder(input_ids)[0] - noise_pred = self.unet(noisy_latents, 
timesteps, - encoder_hidden_states).sample + noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if self.prediction_type == "epsilon": @@ -262,10 +261,7 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): else: raise ValueError(f"Unknown prediction type {self.prediction_type}") - loss = (F.mse_loss( - noise_pred.cast("float32"), - target.cast("float32"), - reduction="none").mean([1, 2, 3]).mean()) + loss = F.mse_loss(noise_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean() return loss @@ -282,19 +278,18 @@ def decode_image(self, pixel_values=None, **kwargs): @paddle.no_grad() def log_image( - self, - input_ids=None, - height=256, - width=256, - eta=0.0, - guidance_scale=7.5, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=0.0, + guidance_scale=7.5, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 8 image if input_ids.shape[0] > 8: input_ids = input_ids[:8] @@ -308,43 +303,34 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, - height // 8, width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) # ddim donot use this latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample @@ -356,12 +342,10 @@ def fn(layer): # 
ldmbert if hasattr(layer, "enable_recompute"): layer.enable_recompute = value - print("Set", layer.__class__, "recompute", - layer.enable_recompute) + print("Set", layer.__class__, "recompute", layer.enable_recompute) # unet if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.apply(fn) diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py index 82d71e6c5f816..b41f0b799469f 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py @@ -46,8 +46,7 @@ def parse_src(filename): elif data_source == "laion_aes": text_json = json.loads(vec[2]) img_b64 = vec[5] - caption = text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) else: _, captions, _, _, _, img_b64 = vec[:6] caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") @@ -77,23 +76,26 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + ): self.size = size if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), interpolation), + RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.text_processing = lambda caption: tokenizer( @@ -101,7 +103,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids[0] + return_tensors="pd", + ).input_ids[0] self.file_list = [] file_weights = [] with open(file_list, "r") as f: @@ -122,19 +125,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -143,9 +141,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") 
else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -171,19 +167,14 @@ def sample_loader(self, file_ids, filenames): if w < self.size or h < self.size: continue yield { - "pixel_values": - self.image_processing(data["image"]), - "input_ids": - self.text_processing(data["caption"]), + "pixel_values": self.image_processing(data["image"]), + "input_ids": self.text_processing(data["caption"]), } def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -192,8 +183,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py index c3249e9caca29..d3da3f1f9d187 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py @@ -26,10 +26,16 @@ ) from paddlenlp.transformers import BertTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LDMBertModel, - LDMTextToImagePipeline, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LDMBertModel, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) paddle.set_device("cpu") @@ -59,8 +65,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -76,8 +81,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -119,8 +123,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -128,21 
+131,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -150,13 +152,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -167,8 +167,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -178,8 +177,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -206,25 +204,19 @@ def create_unet_diffusers_config(original_config): """ unet_params = original_config.model.params.unet_config.params - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 @@ -237,7 +229,8 @@ def create_unet_diffusers_config(original_config): block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, - attention_head_dim=unet_params.num_heads, ) + 
attention_head_dim=unet_params.num_heads, + ) return config @@ -261,14 +254,12 @@ def create_vae_diffusers_config(original_config): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -289,8 +280,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -303,17 +293,12 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -322,35 +307,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = 
len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -359,21 +332,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -385,7 +354,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -398,19 +368,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -419,14 +388,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -437,12 +405,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] 
resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -456,17 +420,17 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.weight", "conv.bias"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -476,27 +440,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -514,107 +479,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = 
vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if 
"encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -622,58 +554,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -681,14 +605,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def 
convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -741,7 +664,8 @@ def create_ldm_bert_config(original_config): attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - pad_token_id=0, ) + pad_token_id=0, + ) return config @@ -755,61 +679,56 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[ - "transformer.token_emb.weight"].numpy() - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[ - "transformer.pos_emb.emb.weight"].numpy() + new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"].numpy() + new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"].numpy() for i in range(config["encoder_layers"]): double_i = 2 * i double_i_plus1 = 2 * i + 1 # convert norm new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight"].numpy() + f"transformer.attn_layers.layers.{double_i}.0.weight" + ].numpy() new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias"].numpy() + f"transformer.attn_layers.layers.{double_i}.0.bias" + ].numpy() new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t() - .numpy()) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t() - .numpy()) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t() - .numpy()) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"] - .t().numpy()) - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"].numpy( - ) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].t().numpy() + ) + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" + ].numpy() new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"].numpy() + f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" + ].numpy() new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"].numpy() - new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = (bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"] - .t().numpy()) + 
f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" + ].numpy() + new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = ( + bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"].numpy( - ) - new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = (bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"] - .t().numpy()) - new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = (bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t() - .numpy()) - - new_checkpoint["final_layer_norm.weight"] = bert_state_dict[ - "transformer.norm.weight"].numpy() - new_checkpoint["final_layer_norm.bias"] = bert_state_dict[ - "transformer.norm.bias"].numpy() + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + ].numpy() + new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = ( + bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].t().numpy() + ) + new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = ( + bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t().numpy() + ) + + new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"].numpy() + new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"].numpy() return new_checkpoint @@ -822,7 +741,8 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) # wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/configs/latent-diffusion/txt2img-1p4B-eval.yaml parser.add_argument( "--original_config_file", @@ -844,13 +764,15 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() @@ -871,46 +793,40 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = UNet2DConditionModel(**diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the VAE model. 
vae_config = create_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] if text_model_type != "BERTEmbedder": print("We only support BERTEmbedder as text_encoder!") # 4. Convert the Bert model. bert_config = create_ldm_bert_config(original_config) - ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint, - bert_config) + ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint, bert_config) bert = LDMBertModel(**bert_config) check_keys(bert, ppdiffusers_bert_checkpoint) bert.load_dict(ppdiffusers_bert_checkpoint) # 5. Convert tokenizer. tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", - model_max_length=bert_config["max_position_embeddings"]) + "bert-base-uncased", model_max_length=bert_config["max_position_embeddings"] + ) if tokenizer.vocab_size != bert_config["vocab_size"]: - print( - "Vocab size mismatched! Please verify your tokenizer or text encoder!" - ) + print("Vocab size mismatched! Please verify your tokenizer or text encoder!") # 6. Convert scheduler. num_train_timesteps = original_config.model.params.timesteps @@ -925,17 +841,14 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif args.scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif args.scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -944,16 +857,11 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") - - pipe = LDMTextToImagePipeline( - vqvae=vae, - bert=bert, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler) + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") + + pipe = LDMTextToImagePipeline(vqvae=vae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py index de9f15339690a..f9e742d3942f6 100644 --- 
a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py +++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py @@ -63,15 +63,13 @@ # loop over resnets/attentions for downblocks hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." - unet_conversion_map_layer.append( - (sd_down_res_prefix, hf_down_res_prefix)) + unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) if i < 3: # no attention layers in down_blocks.3 hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." - unet_conversion_map_layer.append( - (sd_down_atn_prefix, hf_down_atn_prefix)) + unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) for j in range(3): # loop over resnets/attentions for upblocks @@ -83,21 +81,18 @@ # no attention layers in up_blocks.0 hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." - unet_conversion_map_layer.append( - (sd_up_atn_prefix, hf_up_atn_prefix)) + unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) if i < 3: # no downsample in down_blocks.3 hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." - unet_conversion_map_layer.append( - (sd_downsample_prefix, hf_downsample_prefix)) + unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) # no upsample in up_blocks.3 hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." - unet_conversion_map_layer.append( - (sd_upsample_prefix, hf_upsample_prefix)) + unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) hf_mid_atn_prefix = "mid_block.attentions.0." sd_mid_atn_prefix = "middle_block.1." 
@@ -211,8 +206,7 @@ def convert_vae_state_dict(vae_state_dict): # pretty much a no-op -def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, - ppdiffusers_vae_unet_checkpoint): +def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, ppdiffusers_vae_unet_checkpoint): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -228,56 +222,63 @@ def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): ppdiffusers_mapping_to_orig = {} + ppdiffusers_mapping_to_orig["embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight" ppdiffusers_mapping_to_orig[ - "embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight" - ppdiffusers_mapping_to_orig[ - "embeddings.position_embeddings.weight"] = "cond_stage_model.transformer.pos_emb.emb.weight" + "embeddings.position_embeddings.weight" + ] = "cond_stage_model.transformer.pos_emb.emb.weight" for i in range(num_layers): double_i = 2 * i double_i_plus1 = 2 * i + 1 ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm1.weight"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight" - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm1.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias" - - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight", - "transpose", ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight", - "transpose", ) + f"encoder.layers.{i}.norm1.weight" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight" ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight", - "transpose", ) + f"encoder.layers.{i}.norm1.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias" + + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight", + "transpose", + ) + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight", + "transpose", + ) + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight", + "transpose", + ) + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight", + "transpose", + ) ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight", - "transpose", ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias" + f"encoder.layers.{i}.self_attn.out_proj.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias" ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm2.weight"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight" + f"encoder.layers.{i}.norm2.weight" + ] = 
f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight" ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm2.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias" + f"encoder.layers.{i}.norm2.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias" ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear1.weight"] = ( f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight", - "transpose", ) + "transpose", + ) ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.linear1.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + f"encoder.layers.{i}.linear1.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear2.weight"] = ( f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight", - "transpose", ) + "transpose", + ) ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.linear2.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" + f"encoder.layers.{i}.linear2.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" - ppdiffusers_mapping_to_orig[ - "final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight" - ppdiffusers_mapping_to_orig[ - "final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias" + ppdiffusers_mapping_to_orig["final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight" + ppdiffusers_mapping_to_orig["final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias" new_state_dict = {} for k, v in ldmbert_state_dict.items(): @@ -286,18 +287,15 @@ def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): if isinstance(new_name, (list, tuple)): need_transpose = True new_name = new_name[0] - new_state_dict[new_name] = (torch.from_numpy(v.t().numpy()) - if need_transpose else - torch.from_numpy(v.numpy())) + new_state_dict[new_name] = torch.from_numpy(v.t().numpy()) if need_transpose else torch.from_numpy(v.numpy()) # dummpy weights, we donot use this! 
- new_state_dict[ - "cond_stage_model.transformer.to_logits.weight"] = torch.zeros( - new_state_dict[ - "cond_stage_model.transformer.token_emb.weight"].shape) + new_state_dict["cond_stage_model.transformer.to_logits.weight"] = torch.zeros( + new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape + ) new_state_dict["cond_stage_model.transformer.to_logits.bias"] = torch.zeros( - new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[ - 0]) + new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[0] + ) return new_state_dict @@ -308,43 +306,35 @@ def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): default=None, type=str, required=True, - help="Path to the model to convert.", ) + help="Path to the model to convert.", + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) - parser.add_argument( - "--half", action="store_true", help="Save weights in half precision.") + help="Path to the output model.", + ) + parser.add_argument("--half", action="store_true", help="Save weights in half precision.") args = parser.parse_args() pipe = LDMTextToImagePipeline.from_pretrained(args.model_name_or_path) # Convert the UNet model - unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers( - pipe.unet, pipe.unet.state_dict()) + unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.unet, pipe.unet.state_dict()) unet_state_dict = convert_unet_state_dict(unet_state_dict) - unet_state_dict = { - "model.diffusion_model." + k: v - for k, v in unet_state_dict.items() - } + unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()} # Convert the VAE model - vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers( - pipe.vqvae, pipe.vqvae.state_dict()) + vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.vqvae, pipe.vqvae.state_dict()) vae_state_dict = convert_vae_state_dict(vae_state_dict) - vae_state_dict = { - "first_stage_model." + k: v - for k, v in vae_state_dict.items() - } + vae_state_dict = {"first_stage_model." 
+ k: v for k, v in vae_state_dict.items()} # Convert the ldmbert model - text_enc_dict = convert_ldmbert_state_dict( - pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"]) + text_enc_dict = convert_ldmbert_state_dict(pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"]) # Put together new checkpoint - state_dict = { ** unet_state_dict, ** vae_state_dict, ** text_enc_dict} + state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict} if args.half: state_dict = {k: v.half() for k, v in state_dict.items()} state_dict = {"state_dict": state_dict} diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py index 3ab76ea0ffc2b..6890fae514ab5 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py +++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py @@ -41,7 +41,8 @@ linewidth=3, color="r", marker="o", - markerfacecolor="blue", ) + markerfacecolor="blue", +) plt.plot( clip_pt, fid_pt, @@ -49,7 +50,8 @@ linewidth=3, color="b", marker="o", - markerfacecolor="red", ) + markerfacecolor="red", +) plt.xlabel("CLIP Score") plt.ylabel("FID@1k") plt.title("12W Globel Step Pareto Curves - DDIM") diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py index 15352e4cd1d5b..4aa3163536c16 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py @@ -21,8 +21,14 @@ import paddle import paddle.nn as nn -from ldm import (DataArguments, LatentDiffusionModel, ModelArguments, - NoTrainerTrainingArguments, TextImagePair, worker_init_fn) +from ldm import ( + DataArguments, + LatentDiffusionModel, + ModelArguments, + NoTrainerTrainingArguments, + TextImagePair, + worker_init_fn, +) from paddle.io import DataLoader from paddle.optimizer import AdamW from paddlenlp.trainer import PdArgumentParser, set_seed @@ -47,12 +53,11 @@ def get_writer(training_args): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, NoTrainerTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, NoTrainerTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) + math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps + ) training_args.resolution = data_args.resolution training_args.print_config(training_args, "Training") training_args.print_config(model_args, "Model") @@ -64,8 +69,7 @@ def main(): if num_processes > 1: paddle.distributed.init_parallel_env() - training_args.logging_dir = os.path.join(training_args.output_dir, - training_args.logging_dir) + training_args.logging_dir = os.path.join(training_args.output_dir, training_args.logging_dir) if training_args.seed is not None: set_seed(training_args.seed) @@ -75,16 +79,14 @@ def main(): model = LatentDiffusionModel(model_args) model.set_recompute(training_args.recompute) - params_to_train = itertools.chain(model.text_encoder.parameters(), - model.unet.parameters()) + params_to_train = 
itertools.chain(model.text_encoder.parameters(), model.unet.parameters()) lr_scheduler = get_scheduler( training_args.lr_scheduler_type, learning_rate=training_args.learning_rate, - num_warmup_steps=training_args.warmup_steps * - training_args.gradient_accumulation_steps, - num_training_steps=training_args.max_steps * - training_args.gradient_accumulation_steps, ) + num_warmup_steps=training_args.warmup_steps * training_args.gradient_accumulation_steps, + num_training_steps=training_args.max_steps * training_args.gradient_accumulation_steps, + ) optimizer = AdamW( learning_rate=lr_scheduler, @@ -94,8 +96,9 @@ def main(): weight_decay=training_args.weight_decay, epsilon=training_args.adam_epsilon, grad_clip=nn.ClipGradByGlobalNorm(training_args.max_grad_norm) - if training_args.max_grad_norm is not None and - training_args.max_grad_norm > 0 else None, ) + if training_args.max_grad_norm is not None and training_args.max_grad_norm > 0 + else None, + ) train_dataset = TextImagePair( file_list=data_args.file_list, size=data_args.resolution, @@ -103,7 +106,8 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation="lanczos", - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) if num_processes > 1: model = paddle.DataParallel(model) @@ -112,28 +116,23 @@ def main(): train_dataset, batch_size=training_args.per_device_train_batch_size, num_workers=training_args.dataloader_num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) if rank == 0: writer = get_writer(training_args) # Train! - total_batch_size = (training_args.per_device_train_batch_size * - num_processes * - training_args.gradient_accumulation_steps) + total_batch_size = ( + training_args.per_device_train_batch_size * num_processes * training_args.gradient_accumulation_steps + ) logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" - ) - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}" - ) + logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}") global_steps = 0 tic_train = time.time() @@ -144,15 +143,13 @@ def main(): break for step, batch in enumerate(train_dataloader): - if (num_processes > 1 and ( - (step + 1) % training_args.gradient_accumulation_steps != 0) - ) or training_args.recompute: + if ( + num_processes > 1 and ((step + 1) % training_args.gradient_accumulation_steps != 0) + ) or training_args.recompute: # grad acc, no_sync when (step + 1) % training_args.gradient_accumulation_steps != 0: ctx_manager = model.no_sync() else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() with ctx_manager: loss = model(**batch) @@ -170,8 +167,7 @@ def main(): # train log if global_steps % training_args.logging_steps == 0: logs = { - "train/loss": - loss.item() * training_args.gradient_accumulation_steps, + "train/loss": loss.item() * training_args.gradient_accumulation_steps, "train/lr_abs": lr_scheduler.get_lr(), "train/global_steps": global_steps, } @@ -191,48 +187,51 @@ def main(): logger.info(log_str) if global_steps % training_args.image_logging_steps == 0: - reconstruction_img = unwrap_model(model).decode_image( - pixel_values=batch["pixel_values"]) - ddim_10_img = unwrap_model(model).log_image( - input_ids=batch["input_ids"], guidance_scale=1.0) - ddim_75_img = unwrap_model(model).log_image( - input_ids=batch["input_ids"], guidance_scale=7.5) + reconstruction_img = unwrap_model(model).decode_image(pixel_values=batch["pixel_values"]) + ddim_10_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=1.0) + ddim_75_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=7.5) if rank == 0: writer.add_image( "reconstruction", reconstruction_img, global_steps, - dataformats="NHWC", ) + dataformats="NHWC", + ) writer.add_image( "ddim-samples-1.0", ddim_10_img, global_steps, - dataformats="NHWC", ) + dataformats="NHWC", + ) writer.add_image( "ddim-samples-7.5", ddim_75_img, global_steps, - dataformats="NHWC", ) + dataformats="NHWC", + ) tic_train = time.time() if rank == 0 and global_steps % training_args.save_steps == 0: os.makedirs( - os.path.join(training_args.output_dir, - f"global-steps-{global_steps}"), - exist_ok=True, ) + os.path.join(training_args.output_dir, f"global-steps-{global_steps}"), + exist_ok=True, + ) paddle.save( model.state_dict(), os.path.join( training_args.output_dir, f"global-steps-{global_steps}", - "model_state.pdparams", ), ) + "model_state.pdparams", + ), + ) if global_steps >= training_args.max_steps: break if rank == 0: paddle.save( model.state_dict(), - os.path.join(training_args.output_dir, "model_state.pdparams"), ) + os.path.join(training_args.output_dir, "model_state.pdparams"), + ) writer.close() diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py index d0464a661998f..0125d6fc27e9d 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py @@ -16,16 +16,19 @@ import os import paddle -from ldm import (DataArguments, LatentDiffusionModel, LatentDiffusionTrainer, - ModelArguments, TextImagePair) -from 
paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from ldm import ( + DataArguments, + LatentDiffusionModel, + LatentDiffusionTrainer, + ModelArguments, + TextImagePair, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] @@ -33,9 +36,10 @@ def main(): training_args.benchmark = model_args.benchmark training_args.profiler_options = model_args.profiler_options training_args.image_logging_steps = model_args.image_logging_steps = ( - (math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) - if model_args.image_logging_steps > 0 else -1) + (math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) + if model_args.image_logging_steps > 0 + else -1 + ) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -44,16 +48,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
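Both training entry points above round image_logging_steps up to the nearest multiple of logging_steps before training starts, so image logging always lands on a scalar-logging step. A minimal standalone sketch of that rounding, with illustrative values only (not lines from this patch):

import math

logging_steps = 50          # how often scalar logs are written
image_logging_steps = 120   # requested image-logging interval

# round up to the next multiple of logging_steps: ceil(120 / 50) * 50 == 150
image_logging_steps = math.ceil(image_logging_steps / logging_steps) * logging_steps
print(image_logging_steps)  # 150

The trainer-based script additionally maps a non-positive image_logging_steps to -1, which disables image logging entirely.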
@@ -67,32 +69,30 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation="lanczos", - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) if model_args.to_static: - input_ids = paddle.static.InputSpec( - name="input_ids", - shape=[-1, model_args.model_max_length], - dtype="int64") + input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64") pixel_values = paddle.static.InputSpec( name="pixel_values", shape=[-1, 3, data_args.resolution, data_args.resolution], - dtype="float32", ) + dtype="float32", + ) specs = [input_ids, pixel_values] paddle.jit.ignore_module([os]) model = paddle.jit.to_static(model, input_spec=specs) - logger.info("Successfully to apply @to_static with specs: {}".format( - specs)) + logger.info("Successfully to apply @to_static with specs: {}".format(specs)) trainer = LatentDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) - params_to_train = itertools.chain(trainer.model.text_encoder.parameters(), - trainer.model.unet.parameters()) + params_to_train = itertools.chain(trainer.model.text_encoder.parameters(), trainer.model.unet.parameters()) trainer.set_optimizer_grouped_parameters(params_to_train) checkpoint = None diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py index e94c83d4ee0af..14468dc73417a 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py @@ -30,16 +30,13 @@ def crop(clip, i, j, h, w): """ if len(clip.shape) != 4: raise ValueError("clip should be a 4D tensor") - return clip[(...), i:i + h, j:j + w] + return clip[(...), i : i + h, j : j + w] def resize(clip, target_size, interpolation_mode): if len(target_size) != 2: - raise ValueError( - f"target size should be tuple (height, width), instead got {target_size}" - ) - return paddle.nn.functional.interpolate( - x=clip, size=target_size, mode=interpolation_mode, align_corners=False) + raise ValueError(f"target size should be tuple (height, width), instead got {target_size}") + return paddle.nn.functional.interpolate(x=clip, size=target_size, mode=interpolation_mode, align_corners=False) def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): @@ -85,8 +82,7 @@ def to_tensor(clip): """ _is_tensor_video_clip(clip) if not clip.dtype == "uint8": - raise TypeError("clip tensor should have data type uint8. Got %s" % - str(clip.dtype)) + raise TypeError("clip tensor should have data type uint8. 
Got %s" % str(clip.dtype)) return clip.astype(dtype="float32").transpose(perm=[3, 0, 1, 2]) / 255.0 @@ -105,8 +101,7 @@ def normalize(clip, mean, std, inplace=False): clip = clip.clone() mean = paddle.to_tensor(data=mean, place=clip.place).astype(clip.dtype) std = paddle.to_tensor(data=std, place=clip.place).astype(clip.dtype) - clip = clip.substract(mean[:, (None), (None), (None)]).divide(std[:, ( - None), (None), (None)]) + clip = clip.substract(mean[:, (None), (None), (None)]).divide(std[:, (None), (None), (None)]) return clip diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py index aaaa301718d58..97b39c8cf8f86 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py @@ -44,15 +44,15 @@ def __repr__(self) -> str: class RandomResizedCropVideo(paddle.vision.transforms.RandomResizedCrop): def __init__( - self, - size, - scale=(0.08, 1.0), - ratio=(3.0 / 4.0, 4.0 / 3.0), - interpolation_mode="bilinear", ): + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): if isinstance(size, tuple): if len(size) != 2: - raise ValueError( - f"size should be tuple (height, width), instead got {size}") + raise ValueError(f"size should be tuple (height, width), instead got {size}") self.size = size else: self.size = size, size @@ -69,8 +69,7 @@ def __call__(self, clip): size is (C, T, H, W) """ i, j, h, w = self.get_params(clip, self.scale, self.ratio) - return F.resized_crop(clip, i, j, h, w, self.size, - self.interpolation_mode) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) def __repr__(self) -> str: return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}, scale={self.scale}, ratio={self.ratio})" diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py index c40a946bb1047..e2e940e51fc97 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py @@ -21,6 +21,7 @@ from PIL import Image, ImageFile from ._transforms_video import CenterCropVideo, RandomCropVideo + """ VideoFrameDataset """ ImageFile.LOAD_TRUNCATED_IMAGES = True IMG_EXTENSIONS = [ @@ -72,9 +73,7 @@ def is_image_file(filename): def find_classes(dir): assert os.path.exists(dir), f"{dir} does not exist" - classes = [ - d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) - ] + classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))] classes.sort() class_to_idx = {classes[i]: i for i in range(len(classes))} return classes, class_to_idx @@ -87,10 +86,7 @@ def class_name_to_idx(annotation_dir): fpath = os.path.join(annotation_dir, "classInd.txt") with open(fpath, "r") as f: data = f.readlines() - class_to_idx = { - x.strip().split(" ")[1].lower(): int(x.strip().split(" ")[0]) - 1 - for x in data - } + class_to_idx = {x.strip().split(" ")[1].lower(): int(x.strip().split(" ")[0]) - 1 for x in data} return class_to_idx @@ -151,8 +147,7 @@ def split_by_captical(s): return string.rstrip(" ").lower() -def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, - clip_step=None): +def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, clip_step=None): """ Load consecutive clips and consecutive frames from `dir`. 
@@ -181,11 +176,9 @@ def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, assert os.path.isdir(video_path) frames = [] for i, fname in enumerate(sorted(os.listdir(video_path))): - assert is_image_file( - fname), f"fname={fname},video_path={video_path},dir={dir}" + assert is_image_file(fname), f"fname={fname},video_path={video_path},dir={dir}" img_path = os.path.join(video_path, fname) - class_name = video_name.split("_")[ - 1].lower() # v_BoxingSpeedBag_g12_c05 -> boxingspeedbag + class_name = video_name.split("_")[1].lower() # v_BoxingSpeedBag_g12_c05 -> boxingspeedbag class_caption = split_by_captical( video_name.split("_")[1] ) # v_BoxingSpeedBag_g12_c05 -> BoxingSpeedBag -> boxing speed bag @@ -201,7 +194,7 @@ def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, frames = frames[::frame_stride] start_indices = list(range(len(frames)))[::clip_step] for i in start_indices: - clip = frames[i:i + nframes] + clip = frames[i : i + nframes] if len(clip) == nframes: clips.append(clip) return clips, videos @@ -234,18 +227,19 @@ def load_and_transform_frames(frame_list, loader, img_transform=None): class VideoFrameDataset(paddle.io.Dataset): def __init__( - self, - data_root, - resolution, - video_length, - dataset_name="", - subset_split="", - annotation_dir=None, - spatial_transform="", - temporal_transform="", - frame_stride=1, - clip_step=None, - tokenizer=None, ): + self, + data_root, + resolution, + video_length, + dataset_name="", + subset_split="", + annotation_dir=None, + spatial_transform="", + temporal_transform="", + frame_stride=1, + clip_step=None, + tokenizer=None, + ): self.loader = default_loader self.video_length = video_length self.subset_split = subset_split @@ -264,8 +258,7 @@ def __init__( if annotation_dir is None: annotation_dir = os.path.join(data_root, "ucfTrainTestlist") class_to_idx = class_name_to_idx(annotation_dir) - assert (len(class_to_idx) == 101 - ), f"num of classes = {len(class_to_idx)}, not 101" + assert len(class_to_idx) == 101, f"num of classes = {len(class_to_idx)}, not 101" elif dataset_name == "sky": classes, class_to_idx = find_classes(video_dir) else: @@ -279,9 +272,9 @@ def __init__( video_length, class_to_idx, frame_stride=frame_stride, - clip_step=clip_step, ) - assert (len(self.clips[0]) == video_length - ), f"Invalid clip length = {len(self.clips[0])}" + clip_step=clip_step, + ) + assert len(self.clips[0]) == video_length, f"Invalid clip length = {len(self.clips[0])}" if self.temporal_transform == "rand_clips": self.clips = self.videos if subset_split == "all": @@ -296,31 +289,33 @@ def __init__( print("[VideoFrameDataset] video_length", self.video_length) if len(self.clips) == 0: raise RuntimeError( - f"Found 0 clips in {video_dir}. \nSupported image extensions are: " - + ",".join(IMG_EXTENSIONS)) - self.img_transform = paddle.vision.transforms.Compose([ - paddle.vision.transforms.ToTensor(), - paddle.vision.transforms.Normalize((0.5, 0.5, 0.5), - (0.5, 0.5, 0.5)), - ]) + f"Found 0 clips in {video_dir}. 
\nSupported image extensions are: " + ",".join(IMG_EXTENSIONS) + ) + self.img_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) if self.spatial_transform == "center_crop_resize": print("Spatial transform: center crop and then resize") - self.video_transform = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - CenterCropVideo(resolution), - ]) - self.video_transform_step1 = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - ]) - self.video_transform_step2 = paddle.vision.transforms.Compose( - [CenterCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + CenterCropVideo(resolution), + ] + ) + self.video_transform_step1 = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + ] + ) + self.video_transform_step2 = paddle.vision.transforms.Compose([CenterCropVideo(resolution)]) elif self.spatial_transform == "resize": print("Spatial transform: resize with no crop") - self.video_transform = paddle.vision.transforms.Resize( - (resolution, resolution)) + self.video_transform = paddle.vision.transforms.Resize((resolution, resolution)) elif self.spatial_transform == "random_crop": - self.video_transform = paddle.vision.transforms.Compose( - [RandomCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose([RandomCropVideo(resolution)]) elif self.spatial_transform == "": self.video_transform = None else: @@ -332,7 +327,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] else: self.text_processing = None @@ -340,14 +336,13 @@ def __getitem__(self, index): if self.temporal_transform == "rand_clips": raw_video = self.clips[index] rand_idx = random.randint(0, len(raw_video) - self.video_length) - clip = raw_video[rand_idx:rand_idx + self.video_length] + clip = raw_video[rand_idx : rand_idx + self.video_length] else: clip = self.clips[index] assert ( len(clip) == self.video_length ), f"current clip_length={len(clip)}, target clip_length={self.video_length}, {clip}" - frames, labels = load_and_transform_frames(clip, self.loader, - self.img_transform) + frames, labels = load_and_transform_frames(clip, self.loader, self.img_transform) assert ( len(frames) == self.video_length @@ -357,8 +352,7 @@ def __getitem__(self, index): if self.spatial_transform == "center_crop_resize": temp_frames = rearrange(frames, "c t h w -> (c t) h w") temp_frames = self.video_transform_step1(temp_frames) - frames = rearrange( - temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) + frames = rearrange(temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) frames = self.video_transform_step2(frames) else: frames = self.video_transform(frames) @@ -377,7 +371,9 @@ def __getitem__(self, index): "input_ids": self.text_processing(example["caption"]), } else: - tensor_out = {"pixel_values": example["image"], } + tensor_out = { + "pixel_values": example["image"], + } return tensor_out def __len__(self): diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py index a4aefa02a1008..e91a6f6018c21 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py +++ 
b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py @@ -25,100 +25,82 @@ class ModelArguments: # for initialization task_type: str = field( default="short", - metadata={ - "help": - "Type of train task. Should be one of ['short', 'text2video']" - }, ) + metadata={"help": "Type of train task. Should be one of ['short', 'text2video']"}, + ) pretrained_model_name_or_path: str = field( default=None, - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) tokenizer_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not use pretrained model name or path"}, + ) vae_type: str = field( default="3d", metadata={"help": "Type of vae to use. Should be one of ['2d', '3d']"}, ) vae_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained vae name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained vae name or path if not use pretrained model name or path"}, + ) text_encoder_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained text encoder name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained text encoder name or path if not use pretrained model name or path"}, + ) text_encoder_config_file: Optional[str] = field( default=None, - metadata={ - "help": - "Text encoder config file if not use pretrained text encoder" - }, ) - is_text_encoder_trainable: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Text encoder config file if not use pretrained text encoder"}, + ) + is_text_encoder_trainable: bool = field(default=False, metadata={"help": "Whether or not use ema"}) unet_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained unet name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained unet name or path if not use pretrained model name or path"}, + ) unet_config_file: Optional[str] = field( - default=None, - metadata={"help": "Unet config file if not use pretrained unet"}) + default=None, metadata={"help": "Unet config file if not use pretrained unet"} + ) scheduler_beta_start: Optional[float] = field( - default=0.0015, - metadata={"help": "Train or eval scheduler beta start"}) - scheduler_beta_end: Optional[float] = field( - default=0.0155, metadata={"help": "Train or eval scheduler beta end"}) + default=0.0015, metadata={"help": "Train or eval scheduler beta start"} + ) + scheduler_beta_end: Optional[float] = field(default=0.0155, metadata={"help": "Train or eval scheduler beta end"}) scheduler_num_train_timesteps: Optional[int] = field( default=1000, metadata={"help": "Train or eval scheduler number of train timesteps"}, ) eval_scheduler_num_inference_steps: Optional[int] = field( - default=50, - metadata={"help": "Eval scheduler number of inference timesteps"}) + default=50, metadata={"help": "Eval scheduler number of inference timesteps"} + ) # for training - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable 
xformers memory efficient attention"}) + default=False, metadata={"help": "enable xformers memory efficient attention"} + ) scale_factor: Optional[float] = field( default=0.33422927, - metadata={"help": "The scale factor in the first stage encoding"}, ) + metadata={"help": "The scale factor in the first stage encoding"}, + ) shift_factor: Optional[float] = field( default=1.4606637, - metadata={"help": "The shift factor in the first stage encoding"}, ) + metadata={"help": "The shift factor in the first stage encoding"}, + ) loss_type: str = field( default="l1", - metadata={ - "help": - "The loss type to use in training. Should be one of ['l2', 'l1']" - }, ) + metadata={"help": "The loss type to use in training. Should be one of ['l2', 'l1']"}, + ) # for alignmemnt latents_path: str = field( default=None, - metadata={"help": "Path to latents, used for alignment"}, ) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init"}) + metadata={"help": "Path to latents, used for alignment"}, + ) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init"}) if_numpy_genarator_random_alignment: bool = field( default=False, - metadata={"help": "Whether to align random using numpy generator"}, ) + metadata={"help": "Whether to align random using numpy generator"}, + ) numpy_genarator_random_seed: Optional[int] = field( - default=42, metadata={"help": "The random seed for numpy generator"}) - set_seed_for_alignment: bool = field( - default=False, - metadata={"help": "Whether to set seed again for alignment"}) + default=42, metadata={"help": "The random seed for numpy generator"} + ) + set_seed_for_alignment: bool = field(default=False, metadata={"help": "Whether to set seed again for alignment"}) @dataclass @@ -128,8 +110,7 @@ class TrainerArguments: """ # for log - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) @dataclass @@ -140,28 +121,29 @@ class VideoFrameDatasetArguments: train_data_root: str = field( default="/root/data/lvdm/sky", - metadata={"help": "The root path of train dataset files"}, ) - train_subset_split: str = field( - default="train", metadata={"help": "The train subset split"}) + metadata={"help": "The root path of train dataset files"}, + ) + train_subset_split: str = field(default="train", metadata={"help": "The train subset split"}) eval_data_root: str = field( default="/root/data/lvdm/sky", - metadata={"help": "The root path of validation dataset files"}, ) - eval_subset_split: str = field( - default="train", metadata={"help": "The validation subset split"}) + metadata={"help": "The root path of validation dataset files"}, + ) + eval_subset_split: str = field(default="train", metadata={"help": "The validation subset split"}) resolution: int = field( default=256, - metadata={"help": "The resolution"}, ) + metadata={"help": "The resolution"}, + ) video_length: int = field( default=16, - metadata={"help": "The video length"}, ) - dataset_name: str = field( - default="sky", metadata={"help": "The dataset name"}) + metadata={"help": "The video length"}, + ) + dataset_name: str = field(default="sky", metadata={"help": "The dataset name"}) spatial_transform: str = field( default="center_crop_resize", - metadata={"help": "The spatial transform type to use"}, ) - temporal_transform: str = field( - default="rand_clips", - 
metadata={"help": "The temporal transform type to use"}) + metadata={"help": "The spatial transform type to use"}, + ) + temporal_transform: str = field(default="rand_clips", metadata={"help": "The temporal transform type to use"}) clip_step: int = field( default=None, - metadata={"help": "The clip step"}, ) + metadata={"help": "The clip step"}, + ) diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py index feb46a5f5e3ad..39000183c6cce 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py @@ -25,100 +25,79 @@ class ModelArguments: # for initialization task_type: str = field( default="text2video", - metadata={ - "help": - "Type of train task. Should be one of ['short', 'text2video']" - }, ) + metadata={"help": "Type of train task. Should be one of ['short', 'text2video']"}, + ) pretrained_model_name_or_path: str = field( default=None, - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) tokenizer_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not use pretrained model name or path"}, + ) vae_type: str = field( default="2d", metadata={"help": "Type of vae to use. Should be one of ['2d', '3d']"}, ) vae_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained vae name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained vae name or path if not use pretrained model name or path"}, + ) text_encoder_name_or_path: Optional[str] = field( default="openai/clip-vit-large-patch14", - metadata={ - "help": - "Pretrained text encoder name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained text encoder name or path if not use pretrained model name or path"}, + ) text_encoder_config_file: Optional[str] = field( default=None, - metadata={ - "help": - "Text encoder config file if not use pretrained text encoder" - }, ) - is_text_encoder_trainable: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Text encoder config file if not use pretrained text encoder"}, + ) + is_text_encoder_trainable: bool = field(default=False, metadata={"help": "Whether or not use ema"}) unet_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained unet name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained unet name or path if not use pretrained model name or path"}, + ) unet_config_file: Optional[str] = field( - default=None, - metadata={"help": "Unet config file if not use pretrained unet"}) + default=None, metadata={"help": "Unet config file if not use pretrained unet"} + ) scheduler_beta_start: Optional[float] = field( - default=0.00085, - metadata={"help": "Train or eval scheduler beta start"}) - scheduler_beta_end: Optional[float] = field( - default=0.012, metadata={"help": "Train or eval scheduler beta end"}) + default=0.00085, metadata={"help": "Train or eval scheduler beta start"} + ) + scheduler_beta_end: Optional[float] = field(default=0.012, metadata={"help": "Train or eval scheduler beta end"}) 
scheduler_num_train_timesteps: Optional[int] = field( default=1000, metadata={"help": "Train or eval scheduler number of train timesteps"}, ) eval_scheduler_num_inference_steps: Optional[int] = field( - default=50, - metadata={"help": "Eval scheduler number of inference timesteps"}) + default=50, metadata={"help": "Eval scheduler number of inference timesteps"} + ) # for training - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable xformers memory efficient attention"}) + default=False, metadata={"help": "enable xformers memory efficient attention"} + ) scale_factor: Optional[float] = field( default=0.18215, - metadata={"help": "The scale factor in the first stage encoding"}, ) - shift_factor: Optional[float] = field( - default=0, - metadata={"help": "The shift factor in the first stage encoding"}) + metadata={"help": "The scale factor in the first stage encoding"}, + ) + shift_factor: Optional[float] = field(default=0, metadata={"help": "The shift factor in the first stage encoding"}) loss_type: str = field( default="l2", - metadata={ - "help": - "The loss type to use in training. Should be one of ['l2', 'l1']" - }, ) + metadata={"help": "The loss type to use in training. Should be one of ['l2', 'l1']"}, + ) # for alignmemnt latents_path: str = field( default=None, - metadata={"help": "Path to latents, used for alignment"}, ) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init"}) + metadata={"help": "Path to latents, used for alignment"}, + ) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init"}) if_numpy_genarator_random_alignment: bool = field( default=False, - metadata={"help": "Whether to align random using numpy generator"}, ) + metadata={"help": "Whether to align random using numpy generator"}, + ) numpy_genarator_random_seed: Optional[int] = field( - default=42, metadata={"help": "The random seed for numpy generator"}) - set_seed_for_alignment: bool = field( - default=False, - metadata={"help": "Whether to set seed again for alignment"}) + default=42, metadata={"help": "The random seed for numpy generator"} + ) + set_seed_for_alignment: bool = field(default=False, metadata={"help": "Whether to set seed again for alignment"}) @dataclass @@ -128,8 +107,7 @@ class TrainerArguments: """ # for log - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) @dataclass @@ -140,27 +118,34 @@ class WebVidDatasetArguments: train_data_root: str = field( default="/root/data/lvdm/webvid/share_datasets", - metadata={"help": "The root path of train dataset files"}, ) + metadata={"help": "The root path of train dataset files"}, + ) train_annotation_path: str = field( default="/root/data/lvdm/webvid/share_datasets/train_type_data.list", - metadata={"help": "The root path of train annotation"}, ) - train_subset_split: str = field( - default="all", metadata={"help": "The train subset split"}) + metadata={"help": "The root path of train annotation"}, + ) + train_subset_split: str = field(default="all", metadata={"help": "The train subset split"}) eval_data_root: str = field( 
default="/root/data/lvdm/webvid/share_datasets", - metadata={"help": "The root path of validation dataset files"}, ) + metadata={"help": "The root path of validation dataset files"}, + ) eval_annotation_path: str = field( default="/root/data/lvdm/webvid/share_datasets/val_type_data.list", - metadata={"help": "The root path of validation annotation"}, ) - eval_subset_split: str = field( - default="all", metadata={"help": "The validation subset split"}) + metadata={"help": "The root path of validation annotation"}, + ) + eval_subset_split: str = field(default="all", metadata={"help": "The validation subset split"}) resolution: int = field( default=256, - metadata={"help": "The resolution"}, ) + metadata={"help": "The resolution"}, + ) video_length: int = field( default=16, - metadata={"help": "The video length"}, ) - frame_stride: int = field(default=4, ) + metadata={"help": "The video length"}, + ) + frame_stride: int = field( + default=4, + ) spatial_transform: str = field( default="center_crop_resize", - metadata={"help": "The spatial transform type to use"}, ) + metadata={"help": "The spatial transform type to use"}, + ) diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py index 9b00773644bbb..a087314494b33 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py @@ -21,23 +21,34 @@ import numpy as np import paddle import paddle.nn as nn -from einops import rearrange, repeat +from einops import rearrange from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LVDMAutoencoderKL, LVDMUNet3DModel, - is_ppxformers_available) -from ppdiffusers.initializer import (normal_, reset_initialized_parameter, - xavier_uniform_, zeros_) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LVDMAutoencoderKL, + LVDMUNet3DModel, + is_ppxformers_available, +) +from ppdiffusers.initializer import ( + normal_, + reset_initialized_parameter, + xavier_uniform_, + zeros_, +) from ppdiffusers.models.ema import LitEma -from ppdiffusers.models.lvdm_attention_temporal import (RelativePosition, - TemporalCrossAttention) +from ppdiffusers.models.lvdm_attention_temporal import ( + RelativePosition, + TemporalCrossAttention, +) from ppdiffusers.models.lvdm_distributions import DiagonalGaussianDistribution from ppdiffusers.training_utils import freeze_params -def set_seed(seed: int=1234, args=None): +def set_seed(seed: int = 1234, args=None): if args is None: random.seed(seed) np.random.seed(seed) @@ -45,16 +56,14 @@ def set_seed(seed: int=1234, args=None): if args is not None: if args.use_hybrid_parallel: - from paddle.distributed.fleet.meta_parallel import \ - get_rng_state_tracker + from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker random.seed(args.seed + args.dataset_rank) np.random.seed(args.seed + args.dataset_rank) paddle.seed(args.seed + args.dataset_rank) # local_seed/ global_seed is used to control dropout in ModelParallel - local_seed = (args.seed + 59999 + args.tensor_parallel_rank * 10 + - args.pipeline_parallel_rank * 1000) + local_seed = args.seed + 59999 + args.tensor_parallel_rank * 10 + args.pipeline_parallel_rank * 1000 global_seed = args.seed + 100003 + args.dataset_rank tracker = get_rng_state_tracker() @@ -78,12 +87,10 @@ def split_video_to_clips(video, clip_length, 
drop_left=True): video_length = video.shape[2] shape = video.shape if video_length % clip_length != 0 and drop_left: - video = video[:, :, :video_length // clip_length * clip_length, :, :] - print( - f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") + video = video[:, :, : video_length // clip_length * clip_length, :, :] + print(f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") nclips = video_length // clip_length - clips = rearrange( - video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) + clips = rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) return clips @@ -104,17 +111,17 @@ def __init__(self, model_args): if model_args.task_type == "text2video": tokenizer_name_or_path = ( model_args.tokenizer_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "tokenizer")) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "vae")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "vae") + ) self.vae_type = model_args.vae_type self.encoder_type = model_args.vae_type if model_args.vae_type == "2d": @@ -122,7 +129,7 @@ def __init__(self, model_args): elif model_args.vae_type == "3d": self.vae = LVDMAutoencoderKL.from_pretrained(vae_name_or_path) else: - raise ValueError(f"`vae_type` to be `2d` or `3d`.") + raise ValueError("`vae_type` to be `2d` or `3d`.") freeze_params(self.vae.parameters()) logger.info("Freeze vae parameters!") @@ -130,16 +137,14 @@ def __init__(self, model_args): if model_args.task_type == "text2video": text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) self.text_encoder_is_pretrained = text_encoder_name_or_path is not None if self.text_encoder_is_pretrained: - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) else: - self.text_encoder = CLIPTextModel( - **read_json(model_args.text_encoder_config_file)) + self.text_encoder = CLIPTextModel(**read_json(model_args.text_encoder_config_file)) self.init_text_encoder_weights() if not model_args.is_text_encoder_trainable: freeze_params(self.text_encoder.parameters()) @@ -148,14 +153,14 @@ def __init__(self, model_args): # init unet unet_name_or_path = ( model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) self.unet_is_pretrained = model_args.pretrained_model_name_or_path is not None if self.unet_is_pretrained: self.unet = LVDMUNet3DModel.from_pretrained(unet_name_or_path) else: - self.unet = 
LVDMUNet3DModel( - **read_json(model_args.unet_config_file)) + self.unet = LVDMUNet3DModel(**read_json(model_args.unet_config_file)) self.init_unet_weights() # init train scheduler @@ -163,7 +168,8 @@ def __init__(self, model_args): beta_start=model_args.scheduler_beta_start, beta_end=model_args.scheduler_beta_end, beta_schedule="scaled_linear", - num_train_timesteps=model_args.scheduler_num_train_timesteps, ) + num_train_timesteps=model_args.scheduler_num_train_timesteps, + ) # init eval scheduler self.eval_scheduler = DDIMScheduler( @@ -173,23 +179,23 @@ def __init__(self, model_args): num_train_timesteps=model_args.scheduler_num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) - self.eval_scheduler.set_timesteps( - model_args.eval_scheduler_num_inference_steps) + set_alpha_to_one=False, + ) + self.eval_scheduler.set_timesteps(model_args.eval_scheduler_num_inference_steps) # set training parameters self.use_ema = model_args.use_ema if self.use_ema: self.model_ema = LitEma(self.unet) - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) self.scale_factor = model_args.scale_factor self.shift_factor = model_args.shift_factor self.loss_type = model_args.loss_type @@ -198,24 +204,19 @@ def __init__(self, model_args): self.use_preconfig_latents = False if model_args.latents_path: self.use_preconfig_latents = True - self.register_buffer("preconfig_latents", - paddle.load(model_args.latents_path)) + self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path)) - self.if_numpy_genarator_random_alignment = ( - model_args.if_numpy_genarator_random_alignment) + self.if_numpy_genarator_random_alignment = model_args.if_numpy_genarator_random_alignment if self.if_numpy_genarator_random_alignment: - self.generator = np.random.RandomState( - model_args.numpy_genarator_random_seed) + self.generator = np.random.RandomState(model_args.numpy_genarator_random_seed) self.set_seed_for_alignment = model_args.set_seed_for_alignment def init_text_encoder_weights(self): if not self.text_encoder_is_pretrained: reset_initialized_parameter(self.text_encoder) - normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, - 0.02) - normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, - 0.02) + normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02) + normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02) def init_unet_weights(self): if not self.unet_is_pretrained: @@ -256,9 +257,7 @@ def get_first_stage_encoding(self, encoder_posterior, noise=None): elif isinstance(encoder_posterior, paddle.Tensor): z = encoder_posterior else: - raise NotImplementedError( - f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented" - ) + raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented") z = self.scale_factor * (z + self.shift_factor) return z @@ -291,12 +290,7 @@ def decode(self, z, **kwargs): return results @paddle.no_grad() - def overlapped_decode(self, - z, - max_z_t=None, - overlap_t=2, - predict_cids=False, - force_not_quantize=False): + def 
overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False): if max_z_t is None: max_z_t = z.shape[2] assert max_z_t > overlap_t @@ -315,69 +309,56 @@ def overlapped_decode(self, reses = [] for i, z_ in enumerate(zs): if i == 0: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, :max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, : max_x_t - drop_r_x, :, :] elif i == len(zs) - 1: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, drop_l_x:, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, drop_l_x:, :, :] else: - res = self.decode(z_, predict_cids, force_not_quantize).cpu( - )[:, :, drop_l_x:max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[ + :, :, drop_l_x : max_x_t - drop_r_x, :, : + ] reses.append(res) results = paddle.concat(x=reses, axis=2) return results @paddle.no_grad() - def decode_first_stage_2DAE_video(self, - z, - decode_bs=16, - return_cpu=True, - **kwargs): + def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs): b, _, t, _, _ = z.shape z = rearrange(z, "b c t h w -> (b t) c h w") if decode_bs is None: results = self.decode(z, **kwargs) else: - z = paddle.split( - x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) if return_cpu: - results = paddle.concat( - x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) + results = paddle.concat(x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) else: - results = paddle.concat( - x=[self.decode(z_, **kwargs) for z_ in z], axis=0) - results = rearrange( - results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() + results = paddle.concat(x=[self.decode(z_, **kwargs) for z_ in z], axis=0) + results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() return results @paddle.no_grad() def decode_latents( - self, - z, - decode_bs=16, - return_cpu=True, - bs=None, - decode_single_video_allframes=False, - max_z_t=None, - overlapped_length=0, - **kwargs, ): + self, + z, + decode_bs=16, + return_cpu=True, + bs=None, + decode_single_video_allframes=False, + max_z_t=None, + overlapped_length=0, + **kwargs, + ): b, _, t, _, _ = z.shape if self.encoder_type == "2d" and z.dim() == 5: - return self.decode_first_stage_2DAE_video( - z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) + return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) if decode_single_video_allframes: z = paddle.split(x=z, num_or_sections=z.shape[0] // 1, axis=0) cat_dim = 0 elif max_z_t is not None: if self.encoder_type == "3d": - z = paddle.split( - x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) + z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) cat_dim = 2 if self.encoder_type == "2d": - z = paddle.split( - x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) cat_dim = 0 # elif self.split_clips and self.downfactor_t is not None or self.clip_length is not None and self.downfactor_t is not None and z.shape[ # 2 @@ -410,8 +391,7 @@ def get_loss(self, pred, target, mean=True, mask=None): if mean: loss = paddle.nn.functional.mse_loss(target, pred) else: - loss = paddle.nn.functional.mse_loss( - target, pred, reduction="none") + loss = paddle.nn.functional.mse_loss(target, pred, reduction="none") else: raise 
NotImplementedError("unknown loss type '{loss_type}'") if mask is not None: @@ -438,18 +418,18 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): self.generator.randint( 0, self.noise_scheduler.num_train_timesteps, - size=(latents.shape[0], ), ), - dtype="int64", ) - noise = paddle.to_tensor( - self.generator.randn(*latents.shape), dtype="float32") + size=(latents.shape[0],), + ), + dtype="int64", + ) + noise = paddle.to_tensor(self.generator.randn(*latents.shape), dtype="float32") else: timesteps = paddle.randint( - 0, self.noise_scheduler.num_train_timesteps, - (latents.shape[0], )).astype("int64") + 0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],) + ).astype("int64") noise = paddle.randn_like(latents) - noisy_latents = self.noise_scheduler.add_noise(latents, noise, - timesteps) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) encoder_hidden_states = None if self.task_type == "text2video": encoder_hidden_states = self.text_encoder(input_ids)[0] @@ -458,7 +438,8 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): noise_pred = self.unet( noisy_latents, timesteps, - context=encoder_hidden_states, ).sample + context=encoder_hidden_states, + ).sample loss = self.get_loss(noise_pred, noise, mean=True) return loss @@ -485,20 +466,19 @@ def log_reconstruct_frames(self, pixel_values=None, **kwargs): @paddle.no_grad() def log_text2video_sample_frames( - self, - input_ids=None, - height=256, - width=256, - eta=1.0, - guidance_scale=9, - num_frames=16, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=1.0, + guidance_scale=9, + num_frames=16, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 2 video if input_ids.shape[0] > 2: input_ids = input_ids[:2] @@ -512,10 +492,10 @@ def log_text2video_sample_frames( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) if self.use_preconfig_latents: latents = self.preconfig_latents else: @@ -528,36 +508,32 @@ def log_text2video_sample_frames( ] latents = paddle.randn(shape) - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, - context=text_embeddings, ).sample + context=text_embeddings, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample sampled_videos = self.decode_latents(latents) @@ -574,19 +550,11 @@ def log_text2video_sample_frames( return videos_frames @paddle.no_grad() - def log_short_sample_frames(self, - height=256, - width=256, - eta=0.0, - guidance_scale=9, - num_frames=16, - **kwargs): + def log_short_sample_frames(self, height=256, width=256, eta=0.0, guidance_scale=9, num_frames=16, **kwargs): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 2 video batch_size = 2 @@ -602,8 +570,7 @@ def log_short_sample_frames(self, ] latents = paddle.randn(shape) - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta @@ -613,17 +580,16 @@ def log_short_sample_frames(self, latent_model_input = latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, - t, ).sample + t, + ).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample sampled_videos = self.decode_latents(latents) @@ -643,7 +609,6 @@ def set_recompute(self, value=False): def fn(layer): if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.unet.apply(fn) diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py index 90d32ee1eda0b..9fa09eb560f4c 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py @@ -19,8 +19,11 @@ import paddle.amp.auto_cast as autocast from paddle.io import DataLoader from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK, - VisualDLCallback, rewrite_logs) +from paddlenlp.trainer.integrations import ( + INTEGRATION_TO_CALLBACK, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.trainer.utils.helper import nested_detach from paddlenlp.utils.log import logger @@ -39,19 +42,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -62,27 +63,30 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - 
image_logs["reconstruction"] = model.log_reconstruct_frames( - pixel_values=inputs["pixel_values"]) + image_logs["reconstruction"] = model.log_reconstruct_frames(pixel_values=inputs["pixel_values"]) if model.task_type == "text2video": - image_logs[ - "ddim-samples"] = model.log_text2video_sample_frames( - input_ids=inputs["input_ids"], - height=256, - width=256, - eta=1.0, - guidance_scale=9, - num_frames=16, ) + image_logs["ddim-samples"] = model.log_text2video_sample_frames( + input_ids=inputs["input_ids"], + height=256, + width=256, + eta=1.0, + guidance_scale=9, + num_frames=16, + ) elif model.task_type == "short": image_logs["ddim-samples"] = model.log_short_sample_frames( height=256, width=256, eta=1.0, - num_frames=16, ) + num_frames=16, + ) if self.vdl_writer is None: self._init_summary_writer(args) @@ -97,11 +101,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -117,43 +121,41 @@ def compute_loss(self, model, inputs, return_outputs=False): def get_train_dataloader(self): if self.train_dataset is None: raise ValueError("Trainer: training requires a train_dataset.") - if isinstance(self.train_dataset, VideoFrameDataset) or isinstance( - self.train_dataset, WebVidDataset): + if isinstance(self.train_dataset, VideoFrameDataset) or isinstance(self.train_dataset, WebVidDataset): return DataLoader( self.train_dataset, batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, shuffle=True, worker_init_fn=None, - collate_fn=None, ) + collate_fn=None, + ) else: return super().get_train_dataloader() def prediction_step( - self, - model, - inputs, - prediction_loss_only, - ignore_keys, ): + self, + model, + inputs, + prediction_loss_only, + ignore_keys, + ): if self.args.pipeline_parallel_degree > 1: # hack for pipeline mode inputs = self._prepare_inputs(inputs) - return self.prediction_pipeline_step( - model, inputs, prediction_loss_only, ignore_keys) + return self.prediction_pipeline_step(model, inputs, prediction_loss_only, ignore_keys) has_labels = all(inputs.get(k) is not None for k in self.label_names) inputs = self._prepare_inputs(inputs) if ignore_keys is None: if hasattr(self.model, "config"): - ignore_keys = getattr(self.model.config, - "keys_to_ignore_at_inference", []) + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) else: ignore_keys = [] # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. 
if has_labels: - labels = nested_detach( - tuple(inputs.get(name) for name in self.label_names)) + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) if len(labels) == 1: labels = labels[0] else: diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py index 345c3311c88cd..b6636d5924fec 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py @@ -38,16 +38,17 @@ class WebVidDataset(paddle.io.Dataset): """ def __init__( - self, - data_root, - resolution, - video_length, - subset_split, - frame_stride, - spatial_transform="", - load_method="decord", - annotation_path=None, - tokenizer=None, ): + self, + data_root, + resolution, + video_length, + subset_split, + frame_stride, + spatial_transform="", + load_method="decord", + annotation_path=None, + tokenizer=None, + ): self.annotation_path = annotation_path self.data_root = data_root self.resolution = resolution @@ -57,9 +58,7 @@ def __init__( self.spatial_transform = spatial_transform self.load_method = load_method assert self.load_method in ["decord", "readvideo", "videoclips"] - assert self.subset_split in [ - "train", "test", "all", "results_10M_train" - ] + assert self.subset_split in ["train", "test", "all", "results_10M_train"] self.exts = ["avi", "mp4", "webm"] if isinstance(self.resolution, int): self.resolution = [self.resolution, self.resolution] @@ -67,22 +66,23 @@ def __init__( self.max_resolution = max(self.resolution) if self.spatial_transform == "center_crop_resize": print("Spatial transform: center crop and then resize") - self.video_transform = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - CenterCropVideo(resolution), - ]) - self.video_transform_step1 = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - ]) - self.video_transform_step2 = paddle.vision.transforms.Compose( - [CenterCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + CenterCropVideo(resolution), + ] + ) + self.video_transform_step1 = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + ] + ) + self.video_transform_step2 = paddle.vision.transforms.Compose([CenterCropVideo(resolution)]) elif self.spatial_transform == "resize": print("Spatial transform: resize with no crop") - self.video_transform = paddle.vision.transforms.Resize( - (resolution, resolution)) + self.video_transform = paddle.vision.transforms.Resize((resolution, resolution)) elif self.spatial_transform == "random_crop": - self.video_transform = paddle.vision.transforms.Compose( - [RandomCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose([RandomCropVideo(resolution)]) elif self.spatial_transform == "": self.video_transform = None else: @@ -96,7 +96,8 @@ def __init__( truncation=True, max_length=tokenizer.model_max_length, return_tensors="pd", - return_overflowing_tokens=False, ).input_ids[0] + return_overflowing_tokens=False, + ).input_ids[0] else: self.text_processing = None @@ -111,12 +112,9 @@ def _make_dataset(self): self.annotations = fp.read().splitlines() else: self.annotations = sum( - [ - glob.glob( - os.path.join(data_folder, "**", f"*.{ext}"), - recursive=True) for ext in self.exts - ], - [], ) + [glob.glob(os.path.join(data_folder, "**", f"*.{ext}"), recursive=True) for ext in 
self.exts], + [], + ) print(f"Number of videos = {len(self.annotations)}") def get_annotation(self, index): @@ -140,7 +138,8 @@ def get_data_decord(self, index): video_path, ctx=cpu(0), width=self.max_resolution, - height=self.max_resolution, ) + height=self.max_resolution, + ) if len(video_reader) < self.video_length: index += 1 continue @@ -155,23 +154,20 @@ def get_data_decord(self, index): rand_idx = random.randint(0, len(all_frames) - self.video_length) frame_indices = list(range(rand_idx, rand_idx + self.video_length)) frames = video_reader.get_batch(frame_indices) - assert (frames.shape[0] == self.video_length - ), f"{len(frames)}, self.video_length={self.video_length}" - frames = (paddle.to_tensor(data=frames.asnumpy()) - .astype(dtype="float32").transpose(perm=[0, 3, 1, 2])) + assert frames.shape[0] == self.video_length, f"{len(frames)}, self.video_length={self.video_length}" + frames = paddle.to_tensor(data=frames.asnumpy()).astype(dtype="float32").transpose(perm=[0, 3, 1, 2]) if self.video_transform is not None: if self.spatial_transform == "center_crop_resize": temp_frames = rearrange(frames, "c t h w -> (c t) h w") temp_frames = self.video_transform_step1(temp_frames) - frames = rearrange( - temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) + frames = rearrange(temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) frames = self.video_transform_step2(frames) else: frames = self.video_transform(frames) frames = frames.transpose(perm=[1, 0, 2, 3]).astype(dtype="float32") - assert (frames.shape[2] == self.resolution[0] and - frames.shape[3] == self.resolution[1] - ), f"frames={frames.shape}, self.resolution={self.resolution}" + assert ( + frames.shape[2] == self.resolution[0] and frames.shape[3] == self.resolution[1] + ), f"frames={frames.shape}, self.resolution={self.resolution}" frames = (frames / 255 - 0.5) * 2 data = {"video": frames, "caption": caption} @@ -181,7 +177,9 @@ def get_data_decord(self, index): "input_ids": self.text_processing(data["caption"]), } else: - tensor_out = {"pixel_values": data["video"], } + tensor_out = { + "pixel_values": data["video"], + } return tensor_out def get_data_readvideo(self, index): @@ -215,9 +213,9 @@ def main(): subset_split=subset_split, frame_stride=frame_stride, spatial_transform=spatial_transform, - annotation_path=annotation_path, ) - dataloader = paddle.io.data.DataLoader( - dataset, batch_size=2, shuffle=False, num_workers=0) + annotation_path=annotation_path, + ) + dataloader = paddle.io.data.DataLoader(dataset, batch_size=2, shuffle=False, num_workers=0) starttime = time.time() for id, data in enumerate(dataloader): endtime = time.time() @@ -227,7 +225,8 @@ def main(): endtime - starttime, " shape:", data["video"].shape, - data["caption"], ) + data["caption"], + ) starttime = endtime return diff --git a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py index f0ef60f1d4cfd..33a27a91410e8 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py +++ b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py @@ -17,8 +17,7 @@ from ppdiffusers import LVDMUncondPipeline # 加载模型和scheduler -pipe = LVDMUncondPipeline.from_pretrained( - "westfish/lvdm_short_sky_epoch2239_step150079") +pipe = LVDMUncondPipeline.from_pretrained("westfish/lvdm_short_sky_epoch2239_step150079") # 执行pipeline进行推理 seed = 1000 @@ -32,4 +31,5 @@ save_dir=".", save_name="ddim_lvdm_short_sky_epoch2239_step150079", 
scale_factor=0.33422927, - shift_factor=1.4606637, ) + shift_factor=1.4606637, +) diff --git a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py index 520ee5339fbde..bbd9587186d87 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py @@ -17,8 +17,7 @@ from ppdiffusers import LVDMTextToVideoPipeline # 加载模型和scheduler -pipe = LVDMTextToVideoPipeline.from_pretrained( - "westfish/lvdm_text2video_orig_webvid_2m") +pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m") # 执行pipeline进行推理 seed = 2013 @@ -36,4 +35,5 @@ save_name="ddim_lvdm_text_to_video_ucf", encoder_type="2d", scale_factor=0.18215, - shift_factor=0, ) + shift_factor=0, +) diff --git a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py index d562f6ff8b359..2db650c780345 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py @@ -27,13 +27,19 @@ raise ImportError( "OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`." ) -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, LVDMAutoencoderKL, - LVDMUncondPipeline, LVDMUNet3DModel, PNDMScheduler) + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + LVDMAutoencoderKL, + LVDMUncondPipeline, + LVDMUNet3DModel, + PNDMScheduler, +) paddle.set_device("cpu") MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -116,8 +122,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): FILENAME = f"archive/{file_name}".encode("latin") padding_size_plus_fbxx = 4 + 14 data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len( - FILENAME) + padding_size_plus_fbxx + offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx with open(file, "rb") as r: r.seek(offset) for bytes_data in io.BytesIO(r.read()): @@ -130,8 +135,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): return out, offset + len(out) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): if isinstance(storage, TensorMeta): storage.size = size return storage @@ -162,7 +166,8 @@ def create_unet_diffusers_config(original_config): padding_t=unet_params.padding_t, temporal_length=unet_params.temporal_length, use_relative_position=unet_params.use_relative_position, - use_scale_shift_norm=unet_params.use_scale_shift_norm, ) + use_scale_shift_norm=unet_params.use_scale_shift_norm, + ) return config @@ -181,7 +186,8 @@ def create_lvdm_vae_diffusers_config(original_config): padding_type=vae_params.encoder.params.padding_type, double_z=vae_params.encoder.params.double_z, z_channels=vae_params.encoder.params.z_channels, - upsample=vae_params.decoder.params.upsample, ) + 
upsample=vae_params.decoder.params.upsample, + ) return config @@ -190,14 +196,12 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular -def convert_lvdm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_lvdm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -218,8 +222,7 @@ def convert_lvdm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -251,9 +254,7 @@ def convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, config): return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -275,8 +276,7 @@ def check_keys(model, state_dict): if k not in state_dict.keys(): missing_keys.append(k) if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append( - str((k, list(v.shape), list(state_dict[k].shape)))) + mismatched_keys.append(str((k, list(v.shape), list(state_dict[k].shape)))) if len(missing_keys): missing_keys_str = ", ".join(missing_keys) print(f"{cls_name} Found missing_keys {missing_keys_str}!") @@ -293,13 +293,15 @@ def check_keys(model, state_dict): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--vae_checkpoint_path", default=None, type=str, required=False, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default=None, @@ -325,13 +327,15 @@ def check_keys(model, state_dict): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." 
- ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() # image_size = 512 @@ -340,15 +344,13 @@ def check_keys(model, state_dict): vae_checkpoint = None if args.vae_checkpoint_path: - vae_checkpoint = torch.load( - args.vae_checkpoint_path, map_location="cpu") + vae_checkpoint = torch.load(args.vae_checkpoint_path, map_location="cpu") vae_checkpoint = vae_checkpoint.get("state_dict", vae_checkpoint) original_config = OmegaConf.load(args.original_config_file) if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = args.num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -361,7 +363,8 @@ def check_keys(model, state_dict): num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -377,15 +380,13 @@ def check_keys(model, state_dict): elif args.scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif args.scheduler_type == "ddim": scheduler = scheduler else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") # 1. Convert the LVDMUNet3DModel model. diffusers_unet_config = create_unet_diffusers_config(original_config) @@ -393,26 +394,25 @@ def check_keys(model, state_dict): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = LVDMUNet3DModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the LVDMAutoencoderKL model. 
vae_config = create_lvdm_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint( - checkpoint, vae_checkpoint, vae_config) + diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config) vae = LVDMAutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) pipe = LVDMUncondPipeline( vae=vae, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py index 0b09aa164dfe5..0662e05b5bcaa 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py @@ -27,13 +27,20 @@ "OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`." ) from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from transformers import CLIPTextModel as HFCLIPTextModel from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, LVDMAutoencoderKL, - LVDMTextToVideoPipeline, LVDMUNet3DModel, PNDMScheduler) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + LVDMAutoencoderKL, + LVDMTextToVideoPipeline, + LVDMUNet3DModel, + PNDMScheduler, +) paddle.set_device("cpu") MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -116,8 +123,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): FILENAME = f"archive/{file_name}".encode("latin") padding_size_plus_fbxx = 4 + 14 data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len( - FILENAME) + padding_size_plus_fbxx + offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx with open(file, "rb") as r: r.seek(offset) for bytes_data in io.BytesIO(r.read()): @@ -130,8 +136,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): return out, offset + len(out) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): if isinstance(storage, TensorMeta): storage.size = size return storage @@ -160,8 +165,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -191,8 +195,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, 
n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -200,12 +203,13 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements @@ -213,9 +217,7 @@ def assign_to_checkpoint( Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -223,13 +225,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) @@ -241,8 +241,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -252,8 +251,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -297,7 +295,8 @@ def create_unet_diffusers_config(original_config): kernel_size_t=unet_params.kernel_size_t, padding_t=unet_params.padding_t, temporal_length=unet_params.temporal_length, - use_relative_position=unet_params.use_relative_position, ) + use_relative_position=unet_params.use_relative_position, + ) return config @@ -321,7 +320,8 @@ def create_vae_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config @@ -339,7 +339,8 @@ def create_lvdm_vae_diffusers_config(original_config): padding_type=vae_params.encoder.params.padding_type, double_z=vae_params.encoder.params.double_z, z_channels=vae_params.encoder.params.z_channels, - upsample=vae_params.decoder.params.upsample, ) + upsample=vae_params.decoder.params.upsample, + ) return config @@ -348,14 +349,12 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, 
beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular -def convert_lvdm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_lvdm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -376,8 +375,7 @@ def convert_lvdm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -407,107 +405,74 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config): # new_checkpoint = vae_state_dict new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = 
vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -515,58 +480,50 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) 
conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -574,7 +531,8 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint @@ -595,9 +553,7 @@ def convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, config): return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -619,8 +575,7 @@ def check_keys(model, state_dict): if k not in state_dict.keys(): missing_keys.append(k) elif list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append( - str((k, list(v.shape), list(state_dict[k].shape)))) + mismatched_keys.append(str((k, list(v.shape), list(state_dict[k].shape)))) if len(missing_keys): missing_keys_str = ", ".join(missing_keys) print(f"{cls_name} Found missing_keys {missing_keys_str}!") @@ -633,7 +588,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): clip = {} for key in checkpoint.keys(): if key.startswith("cond_stage_model.transformer"): - 
clip[key[len("cond_stage_model.transformer."):]] = checkpoint[key] + clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] new_model_state = {} transformers2ppnlp = { @@ -653,9 +608,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.items(): # step1: ignore position_ids if any(i in name for i in ignore_value): @@ -668,16 +621,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.cpu().numpy().astype(dtype) new_config = { - "max_text_length": - new_model_state["text_model.positional_embedding.weight"].shape[0], - "vocab_size": - new_model_state["text_model.token_embedding.weight"].shape[0], - "text_embed_dim": - new_model_state["text_model.token_embedding.weight"].shape[1], + "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0], + "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0], + "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1], "text_heads": 12, "text_layers": 12, "text_hidden_act": "quick_gelu", @@ -696,19 +646,22 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--vae_checkpoint_path", default=None, type=str, required=False, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--vae_type", default="2d", type=str, required=False, - help="The type of vae, chosen from [`2d `, `3d`].", ) + help="The type of vae, chosen from [`2d `, `3d`].", + ) parser.add_argument( "--original_config_file", default=None, @@ -734,13 +687,15 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." 
- ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() image_size = 512 @@ -750,15 +705,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): vae_checkpoint = None if args.vae_checkpoint_path: - vae_checkpoint = torch.load( - args.vae_checkpoint_path, map_location="cpu") + vae_checkpoint = torch.load(args.vae_checkpoint_path, map_location="cpu") vae_checkpoint = vae_checkpoint.get("state_dict", vae_checkpoint) original_config = OmegaConf.load(args.original_config_file) if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = args.num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -771,7 +724,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -786,15 +740,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): elif args.scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif args.scheduler_type == "ddim": scheduler = scheduler else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") # 1. Convert the LVDMUNet3DModel model. diffusers_unet_config = create_unet_diffusers_config(original_config) @@ -802,46 +754,41 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = LVDMUNet3DModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the AutoencoderKL model. 
if args.vae_type == "2d": - vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint( - checkpoint, vae_checkpoint, vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config) vae = AutoencoderKL.from_config(vae_config) else: vae_config = create_lvdm_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint( - checkpoint, vae_checkpoint, vae_config) + diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config) vae = LVDMAutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text model. - text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - checkpoint, dtype="float32") + text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32") text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_config)) text_encoder.eval() check_keys(text_encoder, text_model_state_dict) text_encoder.load_dict(text_model_state_dict) # 4. load tokenizer. - pp_tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") + pp_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") pipe = LVDMTextToVideoPipeline( vae=vae, text_encoder=text_encoder, tokenizer=pp_tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py index 967fa9cd80f36..2eba6ece4b713 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py +++ b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py @@ -16,34 +16,40 @@ import os import paddle -from lvdm import (LatentVideoDiffusion, LatentVideoDiffusionTrainer, - VideoFrameDataset) -from lvdm.lvdm_args_short import (ModelArguments, TrainerArguments, - VideoFrameDatasetArguments) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from lvdm import LatentVideoDiffusion, LatentVideoDiffusionTrainer, VideoFrameDataset +from lvdm.lvdm_args_short import ( + ModelArguments, + TrainerArguments, + VideoFrameDatasetArguments, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger def main(): - parser = PdArgumentParser(( - ModelArguments, - VideoFrameDatasetArguments, - TrainerArguments, - TrainingArguments, )) + parser = PdArgumentParser( + ( + ModelArguments, + VideoFrameDatasetArguments, + TrainerArguments, + TrainingArguments, + ) + ) ( model_args, data_args, trainer_args, - training_args, ) = parser.parse_args_into_dataclasses() + training_args, + ) = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = trainer_args.image_logging_steps = ( - (math.ceil(trainer_args.image_logging_steps / - training_args.logging_steps) * training_args.logging_steps) - if trainer_args.image_logging_steps > 0 
else -1) + (math.ceil(trainer_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) + if trainer_args.image_logging_steps > 0 + else -1 + ) training_args.print_config(model_args, "Model") training_args.print_config(trainer_args, "Trainer") @@ -53,16 +59,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -78,7 +82,8 @@ def main(): subset_split=data_args.train_subset_split, spatial_transform=data_args.spatial_transform, clip_step=data_args.clip_step, - temporal_transform=data_args.temporal_transform, ) + temporal_transform=data_args.temporal_transform, + ) eval_dataset = VideoFrameDataset( data_root=data_args.eval_data_root, resolution=data_args.resolution, @@ -87,13 +92,15 @@ def main(): subset_split=data_args.eval_subset_split, spatial_transform=data_args.spatial_transform, clip_step=data_args.clip_step, - temporal_transform=data_args.temporal_transform, ) + temporal_transform=data_args.temporal_transform, + ) trainer = LatentVideoDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - eval_dataset=eval_dataset, ) + eval_dataset=eval_dataset, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) diff --git a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py index 4959f59c1b1a6..f7a04f62abb77 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py @@ -16,31 +16,33 @@ import os import paddle -from lvdm import (LatentVideoDiffusion, LatentVideoDiffusionTrainer, - WebVidDataset) -from lvdm.lvdm_args_text2video import (ModelArguments, TrainerArguments, - WebVidDatasetArguments) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from lvdm import LatentVideoDiffusion, LatentVideoDiffusionTrainer, WebVidDataset +from lvdm.lvdm_args_text2video import ( + ModelArguments, + TrainerArguments, + WebVidDatasetArguments, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger def main(): - parser = PdArgumentParser((ModelArguments, WebVidDatasetArguments, - TrainerArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, WebVidDatasetArguments, TrainerArguments, TrainingArguments)) ( model_args, data_args, trainer_args, - training_args, ) = parser.parse_args_into_dataclasses() + 
training_args, + ) = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = trainer_args.image_logging_steps = ( - (math.ceil(trainer_args.image_logging_steps / - training_args.logging_steps) * training_args.logging_steps) - if trainer_args.image_logging_steps > 0 else -1) + (math.ceil(trainer_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) + if trainer_args.image_logging_steps > 0 + else -1 + ) training_args.print_config(model_args, "Model") training_args.print_config(trainer_args, "Trainer") @@ -50,16 +52,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -74,7 +74,8 @@ def main(): video_length=data_args.video_length, frame_stride=data_args.frame_stride, spatial_transform=data_args.spatial_transform, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) eval_dataset = WebVidDataset( data_root=data_args.eval_data_root, annotation_path=data_args.eval_annotation_path, @@ -83,14 +84,16 @@ def main(): video_length=data_args.video_length, frame_stride=data_args.frame_stride, spatial_transform=data_args.spatial_transform, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) trainer = LatentVideoDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) diff --git a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py index 32134c2808903..26c629dff52ae 100644 --- a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py +++ b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py @@ -29,10 +29,10 @@ import paddle.nn as nn import paddle.nn.functional as F from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients -from paddle.io import (BatchSampler, DataLoader, Dataset, - DistributedBatchSampler) +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision.transforms 
import RandomHorizontalFlip from paddlenlp.trainer import set_seed @@ -41,27 +41,30 @@ from PIL import Image from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import (freeze_params, unfreeze_params, - unwrap_model) +from ppdiffusers.training_utils import freeze_params, unfreeze_params, unwrap_model from ppdiffusers.utils import PIL_INTERPOLATION, check_min_version check_min_version("0.16.1") def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -70,8 +73,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -79,8 +83,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -118,25 +123,28 @@ def get_report_to(args): def save_progress(text_encoder, placeholder_token_ids, args, save_path): logger.info("Saving embeddings") learned_embeds = ( - unwrap_model(text_encoder).get_input_embeddings() - .weight[min(placeholder_token_ids):max(placeholder_token_ids) + 1]) + unwrap_model(text_encoder) + .get_input_embeddings() + .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] + ) learned_embeds_dict = {args.placeholder_token: learned_embeds.detach()} paddle.save(learned_embeds_dict, save_path) def parse_args(): - parser = argparse.ArgumentParser( - description="Simple example of a training script.") + parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( "--save_steps", type=int, default=500, - help="Save learned_embeds.pdparams every X updates steps.", ) + help="Save learned_embeds.pdparams every X updates steps.", + ) parser.add_argument( "--only_save_embeds", action="store_true", default=True, - help="Save only the embeddings for the new concept.", ) + help="Save only the embeddings for the new concept.", + ) parser.add_argument( "--num_vectors", type=int, @@ -161,70 +169,79 @@ def parse_args(): type=str, default=None, required=True, - help="A folder 
containing the training data.", ) + help="A folder containing the training data.", + ) parser.add_argument( "--placeholder_token", type=str, default=None, required=True, - help="A token to use as a placeholder for the concept.", ) + help="A token to use as a placeholder for the concept.", + ) parser.add_argument( "--initializer_token", type=str, default=None, required=True, - help="A token to use as initializer word.", ) + help="A token to use as initializer word.", + ) parser.add_argument( "--learnable_property", type=str, default="object", - help="Choose between 'object' and 'style'", ) + help="Choose between 'object' and 'style'", + ) parser.add_argument( "--repeats", type=int, default=100, - help="How many times to repeat the training data.", ) + help="How many times to repeat the training data.", + ) parser.add_argument( "--output_dir", type=str, default="text-inversion-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", action="store_true", - help="Whether to center crop images before resizing to resolution.", ) + help="Whether to center crop images before resizing to resolution.", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument("--num_train_epochs", type=int, default=100) parser.add_argument( "--max_train_steps", @@ -261,19 +278,23 @@ def parse_args(): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), ) + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -284,38 +305,39 @@ def parse_args(): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) + help="Power factor of the polynomial scheduler.", + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=-1, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=-1, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -328,19 +350,24 @@ def parse_args(): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", help=( 'The integration to report the results and logs to. Supported platforms are `"visualdl"`' - ' (default), `"tensorboard"`.'), ) + ' (default), `"tensorboard"`.' + ), + ) parser.add_argument( "--language", default="en", choices=["en", "zh", "zh_en"], - help="Model language.", ) + help="Model language.", + ) parser.add_argument( "--validation_prompt", type=str, @@ -360,16 +387,15 @@ def parse_args(): help=( "Run validation every X epochs. Validation consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`" - " and logging the images."), ) + " and logging the images." + ), + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") args = parser.parse_args() @@ -379,9 +405,7 @@ def parse_args(): if args.language == "en": if "chinese-en" in args.pretrained_model_name_or_path.lower(): args.language = "zh_en" - logger.info( - "Detect Chinese-English Model, we will set language to 'zh_en'. " - ) + logger.info("Detect Chinese-English Model, we will set language to 'zh_en'. 
") elif "chinese" in args.pretrained_model_name_or_path.lower(): args.language = "zh" logger.info("Detect Chinese Model, we will set language to 'zh'. ") @@ -486,19 +510,20 @@ def parse_args(): class TextualInversionDataset(Dataset): def __init__( - self, - data_root, - tokenizer, - learnable_property="object", # [object, style] - height=512, - width=512, - repeats=100, - interpolation="bicubic", - flip_p=0.5, - set="train", - placeholder_token="*", - center_crop=False, - language="en", ): + self, + data_root, + tokenizer, + learnable_property="object", # [object, style] + height=512, + width=512, + repeats=100, + interpolation="bicubic", + flip_p=0.5, + set="train", + placeholder_token="*", + center_crop=False, + language="en", + ): self.data_root = data_root self.tokenizer = tokenizer self.learnable_property = learnable_property @@ -514,8 +539,7 @@ def __init__( ext = ["png", "jpg", "jpeg", "bmp", "PNG", "JPG", "JPEG", "BMP"] self.image_paths = [] for e in ext: - self.image_paths.extend( - glob.glob(os.path.join(data_root, "*." + e))) + self.image_paths.extend(glob.glob(os.path.join(data_root, "*." + e))) self.num_images = len(self.image_paths) self._length = self.num_images @@ -562,7 +586,8 @@ def __getitem__(self, i): padding="max_length", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids # default to score-sde preprocessing img = np.array(image).astype(np.uint8) @@ -571,13 +596,12 @@ def __getitem__(self, i): crop = min(img.shape[0], img.shape[1]) h, w, = ( img.shape[0], - img.shape[1], ) - img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(w + crop - ) // 2] + img.shape[1], + ) + img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2] image = Image.fromarray(img) - image = image.resize( - (self.width, self.height), resample=self.interpolation) + image = image.resize((self.width, self.height), resample=self.interpolation) image = self.flip_transform(image) image = np.array(image).astype(np.uint8) @@ -587,9 +611,7 @@ def __getitem__(self, i): return example -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -618,16 +640,13 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -638,18 +657,14 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: # support windows "\" - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = 
AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") # Add the placeholder token in tokenizer placeholder_tokens = [args.placeholder_token] if args.num_vectors < 1: - raise ValueError( - f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}" - ) + raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}") # add dummy tokens for multi-vector additional_tokens = [] @@ -661,33 +676,28 @@ def main(): if num_added_tokens != args.num_vectors: raise ValueError( f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" - " `placeholder_token` that is not already in the tokenizer.") + " `placeholder_token` that is not already in the tokenizer." + ) # Convert the initializer_token, placeholder_token to ids - initializer_token_ids = tokenizer.encode( - args.initializer_token, add_special_tokens=False)["input_ids"] + initializer_token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)["input_ids"] if len(initializer_token_ids) < 1: - raise ValueError( - "The initializer token must be a greater equal than one.") + raise ValueError("The initializer token must be a greater equal than one.") placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens) - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="unet") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet") # Resize the token embeddings as we are adding new special tokens to the tokenizer text_encoder.resize_token_embeddings(len(tokenizer)) @@ -698,8 +708,8 @@ def main(): # we will compute mean for token_id in placeholder_token_ids: token_embeds.weight[token_id] = paddle.stack( - [token_embeds.weight[each] - for each in initializer_token_ids]).mean(0) + [token_embeds.weight[each] for each in initializer_token_ids] + ).mean(0) # Freeze vae and unet freeze_params(vae.parameters()) @@ -712,14 +722,14 @@ def main(): # unet.enable_gradient_checkpointing() set_recompute(text_encoder, True) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if 
args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) train_dataset = TextualInversionDataset( data_root=args.train_data_dir, @@ -732,71 +742,66 @@ def main(): center_crop=args.center_crop, set="train", language=args.language, - interpolation="bilinear", ) + interpolation="bilinear", + ) def collate_fn(examples): input_ids = [example["input_ids"] for example in examples] - pixel_values = paddle.to_tensor( - [example["pixel_values"] for example in examples], dtype="float32") + pixel_values = paddle.to_tensor([example["pixel_values"] for example in examples], dtype="float32") input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. 
- num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) # Initialize the lr_scheduler lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) # Initialize the optimizer optimizer = AdamW( learning_rate=lr_scheduler, - parameters=text_encoder.get_input_embeddings().parameters( - ), # only optimize the embeddings + parameters=text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings beta1=args.adam_beta1, beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if num_processes > 1: text_encoder = paddle.DataParallel(text_encoder) @@ -809,35 +814,27 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
- progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 # keep original embeddings as reference - orig_embeds_params = ( - unwrap_model(text_encoder).get_input_embeddings().weight.clone()) + orig_embeds_params = unwrap_model(text_encoder).get_input_embeddings().weight.clone() - index_no_updates = paddle.ones((len(tokenizer), ), dtype=paddle.bool) - index_no_updates[min(placeholder_token_ids):max(placeholder_token_ids) + - 1] = False + index_no_updates = paddle.ones((len(tokenizer),), dtype=paddle.bool) + index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False index_no_updates = index_no_updates.cast("int64").sum() # Keep vae and unet in eval model as we don't train these vae.eval() @@ -855,20 +852,19 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where @@ -876,35 +872,29 @@ def collate_fn(examples): text_encoder_ctx_manager = text_encoder.no_sync() else: # unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) with text_encoder_ctx_manager: # Get the text embedding for conditioning if use_attention_mask: - attention_mask = (batch["input_ids"] != - tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] # with unet_ctx_manager: # Predict the noise or sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) 
else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") loss = F.mse_loss(model_pred, target, reduction="mean") if args.gradient_accumulation_steps > 1: @@ -914,18 +904,17 @@ def collate_fn(examples): if (step + 1) % args.gradient_accumulation_steps == 0: if num_processes > 1 and args.gradient_checkpointing: fused_allreduce_gradients( - unwrap_model(text_encoder).get_input_embeddings() - .parameters(), - None, ) + unwrap_model(text_encoder).get_input_embeddings().parameters(), + None, + ) optimizer.step() lr_scheduler.step() optimizer.clear_grad() # Let's make sure we don't update any embedding weights besides the newly added token with paddle.no_grad(): - unwrap_model(text_encoder).get_input_embeddings( - ).weight[: - index_no_updates] = orig_embeds_params[: - index_no_updates] + unwrap_model(text_encoder).get_input_embeddings().weight[:index_no_updates] = orig_embeds_params[ + :index_no_updates + ] progress_bar.update(1) global_step += 1 @@ -945,19 +934,19 @@ def collate_fn(examples): if global_step % args.save_steps == 0: save_path = os.path.join( args.output_dir, - f"learned_embeds-steps-{global_step}.pdparams", ) - save_progress(text_encoder, placeholder_token_ids, args, - save_path) + f"learned_embeds-steps-{global_step}.pdparams", + ) + save_progress(text_encoder, placeholder_token_ids, args, save_path) if global_step >= args.max_train_steps: break if is_main_process: - if (args.validation_prompt is not None and - epoch % args.validation_epochs == 0): + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}.") + f" {args.validation_prompt}." + ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, @@ -965,29 +954,27 @@ def collate_fn(examples): tokenizer=tokenizer, paddle_dtype=paddle_dtype, safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) pipeline.set_progress_bar_config(disable=True) # run inference - generator = (paddle.Generator().manual_seed(args.seed) - if args.seed else None) + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ pipeline( args.validation_prompt, num_inference_steps=25, - generator=generator, ).images[0] + generator=generator, + ).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) if args.report_to == "tensorboard": - writer.add_images( - "test", np_images, epoch, dataformats="NHWC") + writer.add_images("test", np_images, epoch, dataformats="NHWC") else: - writer.add_image( - "test", np_images, epoch, dataformats="NHWC") + writer.add_image("test", np_images, epoch, dataformats="NHWC") del pipeline gc.collect() @@ -998,9 +985,7 @@ def collate_fn(examples): if is_main_process: writer.close() if args.push_to_hub and args.only_save_embeds: - logger.warn( - "Enabling full model saving because --push_to_hub=True was specified." 
- ) + logger.warn("Enabling full model saving because --push_to_hub=True was specified.") save_full_model = True else: save_full_model = not args.only_save_embeds @@ -1008,17 +993,15 @@ def collate_fn(examples): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, text_encoder=unwrap_model(text_encoder), - tokenizer=tokenizer, ) + tokenizer=tokenizer, + ) pipeline.save_pretrained(args.output_dir) # Save the newly trained embeddings save_path = os.path.join(args.output_dir, "learned_embeds.pdparams") save_progress(text_encoder, placeholder_token_ids, args, save_path) if args.push_to_hub: - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) if __name__ == "__main__": diff --git a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py index a157a1f5c1f04..80af56cbf7391 100644 --- a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py +++ b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py @@ -73,8 +73,7 @@ def get_report_to(args): def parse_args(): - parser = argparse.ArgumentParser( - description="Simple example of a training script.") + parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( "--dataset_name", type=str, @@ -83,7 +82,8 @@ def parse_args(): "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," " or to a folder containing files that HF Datasets can understand." - ), ) + ), + ) parser.add_argument( "--dataset_config_name", type=str, @@ -104,7 +104,8 @@ def parse_args(): "A folder containing the training data. Folder contents must follow the structure described in" " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), ) + ), + ) parser.add_argument( "--output_dir", type=str, @@ -124,7 +125,9 @@ def parse_args(): default=64, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -132,40 +135,48 @@ def parse_args(): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." - ), ) + ), + ) parser.add_argument( "--random_flip", default=False, action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--eval_batch_size", type=int, default=16, - help="The number of images to generate for evaluation.", ) + help="The number of images to generate for evaluation.", + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main" - " process."), ) + " process." 
+ ), + ) parser.add_argument("--num_epochs", type=int, default=100) parser.add_argument( "--save_images_epochs", type=int, default=10, - help="How often to save images during training.", ) + help="How often to save images during training.", + ) parser.add_argument( "--save_model_epochs", type=int, default=10, - help="How often to save the model during training.", ) + help="How often to save the model during training.", + ) parser.add_argument( "--gradient_accumulation_steps", type=int, @@ -184,34 +195,40 @@ def parse_args(): default="cosine", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--adam_beta1", type=float, default=0.95, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) + help="The beta2 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_weight_decay", type=float, default=1e-6, - help="Weight decay magnitude for the Adam optimizer.", ) + help="Weight decay magnitude for the Adam optimizer.", + ) parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer.", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer.", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--use_ema", action="store_true", @@ -221,26 +238,31 @@ def parse_args(): "--ema_inv_gamma", type=float, default=1.0, - help="The inverse gamma value for the EMA decay.", ) + help="The inverse gamma value for the EMA decay.", + ) parser.add_argument( "--ema_power", type=float, default=3 / 4, - help="The power value for the EMA decay.", ) + help="The power value for the EMA decay.", + ) parser.add_argument( "--ema_max_decay", type=float, default=0.9999, - help="The maximum decay magnitude for EMA.", ) + help="The maximum decay magnitude for EMA.", + ) parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -250,7 +272,8 @@ def parse_args(): parser.add_argument( "--hub_private_repo", action="store_true", - help="Whether or not to create a private repository.", ) + help="Whether or not to create a private repository.", + ) parser.add_argument( "--logger", type=str, @@ -259,14 +282,17 @@ def parse_args(): help=( "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)" " for experiment tracking and logging of model metrics and model checkpoints" - ), ) + ), + ) parser.add_argument( "--logging_dir", type=str, default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. 
Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."), ) + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) parser.add_argument( "--prediction_type", type=str, @@ -283,7 +309,9 @@ def parse_args(): default=500, help=( "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" - " training using `--resume_from_checkpoint`."), ) + " training using `--resume_from_checkpoint`." + ), + ) parser.add_argument( "--checkpoints_total_limit", type=int, @@ -291,29 +319,24 @@ def parse_args(): help=( "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs"), ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + " for more docs" + ), + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) + help="Whether or not to use xformers.", + ) args = parser.parse_args() if args.dataset_name is None and args.train_data_dir is None: - raise ValueError( - "You must specify either a dataset name from the hub or a train data directory." - ) + raise ValueError("You must specify either a dataset name from the hub or a train data directory.") return args -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -349,8 +372,7 @@ def save_model_hook(models, weights, output_dir): def load_model_hook(models, input_dir): if args.use_ema: - load_model = EMAModel.from_pretrained( - os.path.join(input_dir, "unet_ema"), UNet2DModel) + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) ema_model.load_state_dict(load_model.state_dict()) del load_model @@ -359,8 +381,7 @@ def load_model_hook(models, input_dir): model = models.pop() # load ppdiffusers style into model - load_model = UNet2DModel.from_pretrained( - input_dir, subfolder="unet") + load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet") model.register_to_config(**load_model.config) model.load_state_dict(load_model.state_dict()) @@ -374,21 +395,20 @@ def load_model_hook(models, input_dir): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, ) + level=logging.INFO, + ) # Handle the repository creation if is_main_process: if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) # repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -410,14 +430,17 @@ def load_model_hook(models, input_dir): 
"DownBlock2D", "DownBlock2D", "AttnDownBlock2D", - "DownBlock2D", ), + "DownBlock2D", + ), up_block_types=( "UpBlock2D", "AttnUpBlock2D", "UpBlock2D", "UpBlock2D", "UpBlock2D", - "UpBlock2D", ), ) + "UpBlock2D", + ), + ) else: config = UNet2DModel.load_config(args.model_config_name_or_path) model = UNet2DModel.from_config(config) @@ -431,28 +454,30 @@ def load_model_hook(models, input_dir): inv_gamma=args.ema_inv_gamma, power=args.ema_power, model_cls=UNet2DModel, - model_config=model.config, ) + model_config=model.config, + ) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: model.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # Initialize the scheduler - accepts_prediction_type = "prediction_type" in set( - inspect.signature(DDPMScheduler.__init__).parameters.keys()) + accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys()) if accepts_prediction_type: noise_scheduler = DDPMScheduler( num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, - prediction_type=args.prediction_type, ) + prediction_type=args.prediction_type, + ) else: noise_scheduler = DDPMScheduler( num_train_timesteps=args.ddpm_num_steps, - beta_schedule=args.ddpm_beta_schedule, ) + beta_schedule=args.ddpm_beta_schedule, + ) # Get the datasets: you can either provide your own training and evaluation files (see below) # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). @@ -464,31 +489,30 @@ def load_model_hook(models, input_dir): args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, - split="train", ) + split="train", + ) else: dataset = load_dataset( "imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, - split="train", ) + split="train", + ) # See more about loading custom images at # Preprocessing the datasets and DataLoaders creation. 
- augmentations = transforms.Compose([ - transforms.Resize( - args.resolution, interpolation="bilinear"), - transforms.CenterCrop(args.resolution) - if args.center_crop else transforms.RandomCrop(args.resolution), - transforms.RandomHorizontalFlip() - if args.random_flip else transforms.Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + augmentations = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation="bilinear"), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def transform_images(examples): - images = [ - augmentations(image.convert("RGB")) for image in examples["image"] - ] + images = [augmentations(image.convert("RGB")) for image in examples["image"]] return {"input": images} # logger.info(f"Dataset size: {len(dataset)}") @@ -498,7 +522,8 @@ def transform_images(examples): dataset, batch_size=args.train_batch_size, shuffle=True, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) if num_processes > 1: model = paddle.DataParallel(model) @@ -507,9 +532,9 @@ def transform_images(examples): lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=(len(train_dataloader) * args.num_epochs), ) + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=(len(train_dataloader) * args.num_epochs), + ) # Initialize the optimizer optimizer = paddle.optimizer.AdamW( @@ -519,8 +544,8 @@ def transform_images(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if is_main_process: logger.info("----------- Configuration Arguments -----------") @@ -530,22 +555,16 @@ def transform_images(examples): writer = get_report_to(args) # Prepare everything with our `accelerator`. - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) max_train_steps = args.num_epochs * num_update_steps_per_epoch logger.info("***** Running training *****") logger.info(f" Num examples = {len(dataset)}") logger.info(f" Num Epochs = {args.num_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {max_train_steps}") global_step = 0 @@ -554,8 +573,7 @@ def transform_images(examples): # Train! for epoch in range(first_epoch, args.num_epochs): model.train() - progress_bar = tqdm( - total=num_update_steps_per_epoch, disable=not is_main_process) + progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not is_main_process) progress_bar.set_description(f"Epoch {epoch}") for step, batch in enumerate(train_dataloader): clean_images = batch["input"] @@ -563,34 +581,30 @@ def transform_images(examples): noise = paddle.randn(clean_images.shape) bsz = clean_images.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (bsz, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,)).cast("int64") # Add noise to the clean images according to the noise magnitude at each timestep # (this is the forward diffusion process) - noisy_images = noise_scheduler.add_noise(clean_images, noise, - timesteps) + noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps) # Predict the noise residual model_output = model(noisy_images, timesteps).sample if args.prediction_type == "epsilon": - loss = F.mse_loss(model_output, - noise) # this could have different weights! + loss = F.mse_loss(model_output, noise) # this could have different weights! elif args.prediction_type == "sample": alpha_t = _extract_into_tensor( noise_scheduler.alphas_cumprod, timesteps, - (clean_images.shape[0], 1, 1, 1), ) + (clean_images.shape[0], 1, 1, 1), + ) snr_weights = alpha_t / (1 - alpha_t) loss = snr_weights * F.mse_loss( model_output, clean_images, reduction="none" ) # use SNR weighting from distillation paper loss = loss.mean() else: - raise ValueError( - f"Unsupported prediction type: {args.prediction_type}") + raise ValueError(f"Unsupported prediction type: {args.prediction_type}") loss.backward() @@ -607,13 +621,10 @@ def transform_images(examples): if global_step % args.checkpointing_steps == 0: if is_main_process: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") if args.use_ema: - unwrap_model(ema_model).save_pretrained( - os.path.join(save_path, "unet_ema")) - unwrap_model(model).save_pretrained( - os.path.join(save_path, "unet")) + unwrap_model(ema_model).save_pretrained(os.path.join(save_path, "unet_ema")) + unwrap_model(model).save_pretrained(os.path.join(save_path, "unet")) logger.info(f"Saved state to {save_path}") @@ -638,7 +649,8 @@ def transform_images(examples): ema_model.copy_to(unet.parameters()) pipeline = DDPMPipeline( unet=unet, - scheduler=noise_scheduler, ) + scheduler=noise_scheduler, + ) generator = paddle.Generator().manual_seed(0) # run pipeline in inference (sample random noise and denoise) @@ -646,7 +658,8 @@ def transform_images(examples): generator=generator, batch_size=args.eval_batch_size, num_inference_steps=args.ddpm_num_inference_steps, - output_type="numpy", ).images + output_type="numpy", + ).images if args.use_ema: ema_model.restore(unet.parameters()) @@ -657,13 +670,15 @@ def transform_images(examples): "test", images_processed.transpose(0, 3, 1, 2), epoch, - dataformats="NHWC", ) + dataformats="NHWC", + ) else: writer.add_image( "test", images_processed.transpose(0, 3, 1, 
2), epoch, - dataformats="NHWC", ) + dataformats="NHWC", + ) if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1: # save the model @@ -676,7 +691,8 @@ def transform_images(examples): pipeline = DDPMPipeline( unet=unet, - scheduler=noise_scheduler, ) + scheduler=noise_scheduler, + ) pipeline.save_pretrained(args.output_dir) diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py index f8c3b7f6ce1f4..f86f792718938 100644 --- a/ppdiffusers/ppdiffusers/__init__.py +++ b/ppdiffusers/ppdiffusers/__init__.py @@ -17,13 +17,26 @@ from . import patches from .configuration_utils import ConfigMixin from .utils import ( - OptionalDependencyNotAvailable, is_einops_available, - is_fastdeploy_available, is_inflect_available, is_k_diffusion_available, - is_k_diffusion_version, is_librosa_available, is_note_seq_available, - is_paddle_available, is_paddle_version, is_paddlenlp_available, - is_paddlenlp_version, is_ppxformers_available, is_safetensors_available, - is_scipy_available, is_torch_available, is_unidecode_available, - is_visualdl_available, logging) + OptionalDependencyNotAvailable, + is_einops_available, + is_fastdeploy_available, + is_inflect_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_librosa_available, + is_note_seq_available, + is_paddle_available, + is_paddle_version, + is_paddlenlp_available, + is_paddlenlp_version, + is_ppxformers_available, + is_safetensors_available, + is_scipy_available, + is_torch_available, + is_unidecode_available, + is_visualdl_available, + logging, +) from .version import VERSION as __version__ try: @@ -41,32 +54,75 @@ from .utils.dummy_paddle_objects import * # noqa F403 else: from .models import ( - AutoencoderKL, ControlNetModel, LitEma, LVDMAutoencoderKL, - LVDMUNet3DModel, ModelMixin, MultiAdapter, PriorTransformer, T2IAdapter, - T5FilmDecoder, Transformer2DModel, UNet1DModel, UNet2DConditionModel, - UNet2DModel, UNet3DConditionModel, VQModel) + AutoencoderKL, + ControlNetModel, + LitEma, + LVDMAutoencoderKL, + LVDMUNet3DModel, + ModelMixin, + MultiAdapter, + PriorTransformer, + T2IAdapter, + T5FilmDecoder, + Transformer2DModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + VQModel, + ) from .optimization import ( - get_constant_schedule, get_constant_schedule_with_warmup, + get_constant_schedule, + get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup, - get_polynomial_decay_schedule_with_warmup, get_scheduler) + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + ) from .pipelines import ( - AudioPipelineOutput, DanceDiffusionPipeline, DDIMPipeline, DDPMPipeline, - DiffusionPipeline, DiTPipeline, ImagePipelineOutput, KarrasVePipeline, - LDMPipeline, LDMSuperResolutionPipeline, PNDMPipeline, RePaintPipeline, - ScoreSdeVePipeline, TextPipelineOutput) + AudioPipelineOutput, + DanceDiffusionPipeline, + DDIMPipeline, + DDPMPipeline, + DiffusionPipeline, + DiTPipeline, + ImagePipelineOutput, + KarrasVePipeline, + LDMPipeline, + LDMSuperResolutionPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, + TextPipelineOutput, + ) from .schedulers import ( - DDIMInverseScheduler, DDIMScheduler, DDPMScheduler, - DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, DPMSolverUniDiffuserScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, IPNDMScheduler, KarrasVeScheduler, - 
KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, PNDMScheduler, - RePaintScheduler, SchedulerMixin, ScoreSdeVeScheduler, UnCLIPScheduler, - UniPCMultistepScheduler, VQDiffusionScheduler) - from .schedulers.preconfig import (PreconfigEulerAncestralDiscreteScheduler, - PreconfigLMSDiscreteScheduler) + DDIMInverseScheduler, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + DPMSolverUniDiffuserScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + IPNDMScheduler, + KarrasVeScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + PNDMScheduler, + RePaintScheduler, + SchedulerMixin, + ScoreSdeVeScheduler, + UnCLIPScheduler, + UniPCMultistepScheduler, + VQDiffusionScheduler, + ) + from .schedulers.preconfig import ( + PreconfigEulerAncestralDiscreteScheduler, + PreconfigLMSDiscreteScheduler, + ) from .training_utils import EMAModel try: @@ -84,36 +140,58 @@ from .utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 else: from .pipelines import ( - AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, AudioLDMPipeline, - CycleDiffusionPipeline, IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, IFPipeline, - IFSuperResolutionPipeline, LDMTextToImagePipeline, - LVDMTextToVideoPipeline, LVDMUncondPipeline, PaintByExamplePipeline, - SemanticStableDiffusionPipeline, StableDiffusionAdapterPipeline, + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + AudioLDMPipeline, + CycleDiffusionPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + LDMTextToImagePipeline, + LVDMTextToVideoPipeline, + LVDMUncondPipeline, + PaintByExamplePipeline, + SemanticStableDiffusionPipeline, + StableDiffusionAdapterPipeline, StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, - StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, + StableDiffusionControlNetPipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, StableDiffusionMegaPipeline, - StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, - StableDiffusionPipeline, StableDiffusionPipelineAllinOne, - StableDiffusionPipelineSafe, StableDiffusionPix2PixZeroPipeline, - StableDiffusionSAGPipeline, StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, - TextToVideoSDPipeline, TextToVideoZeroPipeline, - UnCLIPImageVariationPipeline, UnCLIPPipeline, UniDiffuserPipeline, + StableDiffusionLatentUpscalePipeline, + StableDiffusionMegaPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionPipeline, + StableDiffusionPipelineAllinOne, + StableDiffusionPipelineSafe, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + TextToVideoSDPipeline, + TextToVideoZeroPipeline, + UnCLIPImageVariationPipeline, + UnCLIPPipeline, + UniDiffuserPipeline, VersatileDiffusionDualGuidedPipeline, - 
VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, VQDiffusionPipeline) - from .pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VQDiffusionPipeline, + ) + from .pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel from .pipelines.unidiffuser.caption_decoder import CaptionDecoder try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_k_diffusion_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403 @@ -121,21 +199,22 @@ from .pipelines import StableDiffusionKDiffusionPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_fastdeploy_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 else: - from .pipelines import (FastDeployCycleDiffusionPipeline, - FastDeployStableDiffusionControlNetPipeline, - FastDeployStableDiffusionImageVariationPipeline, - FastDeployStableDiffusionImg2ImgPipeline, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionInpaintPipelineLegacy, - FastDeployStableDiffusionMegaPipeline, - FastDeployStableDiffusionPipeline, - FastDeployStableDiffusionUpscalePipeline) + from .pipelines import ( + FastDeployCycleDiffusionPipeline, + FastDeployStableDiffusionControlNetPipeline, + FastDeployStableDiffusionImageVariationPipeline, + FastDeployStableDiffusionImg2ImgPipeline, + FastDeployStableDiffusionInpaintPipeline, + FastDeployStableDiffusionInpaintPipelineLegacy, + FastDeployStableDiffusionMegaPipeline, + FastDeployStableDiffusionPipeline, + FastDeployStableDiffusionUpscalePipeline, + ) try: if not (is_paddle_available() and is_librosa_available()): @@ -146,8 +225,7 @@ from .pipelines import AudioDiffusionPipeline, Mel try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_note_seq_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403 @@ -155,8 +233,7 @@ from .pipelines import SpectrogramDiffusionPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_einops_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403 diff --git a/ppdiffusers/ppdiffusers/commands/env.py b/ppdiffusers/ppdiffusers/commands/env.py index a020de6813b7d..0ad95fd647340 100644 --- a/ppdiffusers/ppdiffusers/commands/env.py +++ b/ppdiffusers/ppdiffusers/commands/env.py @@ -57,9 +57,7 @@ def run(self): "Using distributed or parallel set-up in script?": "", } - print( - "\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n" - ) + print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last 
points.\n")
         print(self.format_dict(info))
 
         return info
diff --git a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
index d14e14711dedc..7575e5902a50e 100644
--- a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
+++ b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
@@ -20,10 +20,8 @@
 
 
 def main():
-    parser = ArgumentParser(
-        "PPDiffusers CLI tool", usage="ppdiffusers-cli []")
-    commands_parser = parser.add_subparsers(
-        help="ppdiffusers-cli command helpers")
+    parser = ArgumentParser("PPDiffusers CLI tool", usage="ppdiffusers-cli []")
+    commands_parser = parser.add_subparsers(help="ppdiffusers-cli command helpers")
 
     # Register commands
     EnvironmentCommand.register_subcommand(commands_parser)
diff --git a/ppdiffusers/ppdiffusers/configuration_utils.py b/ppdiffusers/ppdiffusers/configuration_utils.py
index 2c5d4e88c84e7..551fb118afa9e 100644
--- a/ppdiffusers/ppdiffusers/configuration_utils.py
+++ b/ppdiffusers/ppdiffusers/configuration_utils.py
@@ -33,9 +33,16 @@
 import numpy as np
 import paddle
 
-from .utils import (DIFFUSERS_CACHE, PPDIFFUSERS_CACHE, DummyObject,
-                    bos_hf_download, deprecate, extract_commit_hash,
-                    http_user_agent, logging)
+from .utils import (
+    DIFFUSERS_CACHE,
+    PPDIFFUSERS_CACHE,
+    DummyObject,
+    bos_hf_download,
+    deprecate,
+    extract_commit_hash,
+    http_user_agent,
+    logging,
+)
 from .utils.constants import FROM_HF_HUB
 from .version import VERSION as __version__
 
@@ -54,36 +61,25 @@ def __init__(self, *args, **kwargs):
         self.__frozen = True
 
     def __delitem__(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance."
-        )
+        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
 
     def setdefault(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance."
-        )
+        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
 
     def pop(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
 
     def update(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``update`` on a {self.__class__.__name__} instance."
-        )
+        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
 
     def __setattr__(self, name, value):
         if hasattr(self, "__frozen") and self.__frozen:
-            raise Exception(
-                f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance."
-            )
+            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
         super().__setattr__(name, value)
 
     def __setitem__(self, name, value):
         if hasattr(self, "__frozen") and self.__frozen:
-            raise Exception(
-                f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance."
- ) + raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.") super().__setitem__(name, value) @@ -112,9 +108,7 @@ class ConfigMixin: def register_to_config(self, **kwargs): if self.config_name is None: - raise NotImplementedError( - f"Make sure that {self.__class__} has defined a class name `config_name`" - ) + raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`") # Special case for `kwargs` used in deprecation warning added to schedulers # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, # or solve in a more general way. @@ -124,9 +118,8 @@ def register_to_config(self, **kwargs): internal_dict = kwargs else: previous_dict = dict(self._internal_dict) - internal_dict = { ** self._internal_dict, ** kwargs} - logger.debug( - f"Updating config from {previous_dict} to {internal_dict}") + internal_dict = {**self._internal_dict, **kwargs} + logger.debug(f"Updating config from {previous_dict} to {internal_dict}") self._internal_dict = FrozenDict(internal_dict) @@ -137,8 +130,7 @@ def __getattr__(self, name: str) -> Any: https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module """ - is_in_config = "_internal_dict" in self.__dict__ and hasattr( - self.__dict__["_internal_dict"], name) + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) is_attribute = name in self.__dict__ if is_in_config and not is_attribute: @@ -147,18 +139,19 @@ def __getattr__(self, name: str) -> Any: "direct config name access", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) return self._internal_dict[name] - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{name}'") + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def save_config( - self, - save_directory: Union[str, os.PathLike], - push_to_hub: bool=False, - to_diffusers=False, - **kwargs, ): + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + to_diffusers=False, + **kwargs, + ): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the [`~ConfigMixin.from_config`] class method. @@ -168,9 +161,7 @@ def save_config( Directory where the configuration JSON file will be saved (will be created if it does not exist). """ if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) @@ -182,10 +173,11 @@ def save_config( @classmethod def from_config( - cls, - config: Union[FrozenDict, Dict[str, Any]]=None, - return_unused_kwargs=False, - **kwargs, ): + cls, + config: Union[FrozenDict, Dict[str, Any]] = None, + return_unused_kwargs=False, + **kwargs, + ): r""" Instantiate a Python class from a config dictionary @@ -222,9 +214,7 @@ def from_config( config = kwargs.pop("pretrained_model_name_or_path") if config is None: - raise ValueError( - "Please make sure to provide a config as the first positional argument." - ) + raise ValueError("Please make sure to provide a config as the first positional argument.") # ======> if not isinstance(config, dict): @@ -233,24 +223,27 @@ def from_config( deprecation_message += ( f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead." 
" Otherwise, please make sure to pass a configuration dictionary instead. This functionality will" - " be removed in v1.0.0.") + " be removed in v1.0.0." + ) elif "Model" in cls.__name__: deprecation_message += ( f"If you were trying to load a model, please use {cls}.load_config(...) followed by" f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary" - " instead. This functionality will be removed in v1.0.0.") + " instead. This functionality will be removed in v1.0.0." + ) deprecate( "config-passed-as-path", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) config, kwargs = cls.load_config( pretrained_model_name_or_path=config, return_unused_kwargs=True, - **kwargs, ) + **kwargs, + ) - init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, - **kwargs) + init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs) # Allow dtype to be specified on initialization if "dtype" in unused_kwargs: @@ -259,8 +252,7 @@ def from_config( # add possible deprecated kwargs for deprecated_kwarg in cls._deprecated_kwargs: if deprecated_kwarg in unused_kwargs: - init_dict[deprecated_kwarg] = unused_kwargs.pop( - deprecated_kwarg) + init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg) # Return model and optionally state and/or unused_kwargs model = cls(**init_dict) @@ -269,7 +261,7 @@ def from_config( model.register_to_config(**hidden_dict) # add hidden kwargs of compatible classes to unused_kwargs - unused_kwargs = { ** unused_kwargs, ** hidden_dict} + unused_kwargs = {**unused_kwargs, **hidden_dict} if return_unused_kwargs: return (model, unused_kwargs) @@ -280,21 +272,19 @@ def from_config( def get_config_dict(cls, *args, **kwargs): deprecation_message = ( f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. 
This function will be" - " removed in version v1.0.0") - deprecate( - "get_config_dict", - "1.0.0", - deprecation_message, - standard_warn=False) + " removed in version v1.0.0" + ) + deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False) return cls.load_config(*args, **kwargs) @classmethod def load_config( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - return_unused_kwargs=False, - return_commit_hash=False, - **kwargs, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + return_unused_kwargs=False, + return_commit_hash=False, + **kwargs, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: r""" Instantiate a Python class from a config dictionary @@ -354,8 +344,9 @@ def load_config( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) @@ -365,7 +356,7 @@ def load_config( _ = kwargs.pop("mirror", None) subfolder = kwargs.pop("subfolder", None) user_agent = kwargs.pop("user_agent", {}) - user_agent = { ** user_agent, "file_type": "config"} + user_agent = {**user_agent, "file_type": "config"} user_agent = http_user_agent(user_agent) # new add return_config_file return_config_file = kwargs.pop("return_config_file", False) @@ -381,17 +372,13 @@ def load_config( if os.path.isfile(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile( - os.path.join(pretrained_model_name_or_path, - cls.config_name)): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model_name_or_path, - cls.config_name) + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, - cls.config_name)): - config_file = os.path.join(pretrained_model_name_or_path, - subfolder, cls.config_name) + os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + ): + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) else: raise EnvironmentError( f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." @@ -409,7 +396,8 @@ def load_config( user_agent=user_agent, subfolder=subfolder, revision=revision, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) try: # Load config dict @@ -417,23 +405,20 @@ def load_config( commit_hash = extract_commit_hash(config_file) except (json.JSONDecodeError, UnicodeDecodeError): - raise EnvironmentError( - f"It looks like the config file at '{config_file}' is not a valid JSON file." 
- ) + raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.") - if not (return_unused_kwargs or return_commit_hash or - return_config_file): + if not (return_unused_kwargs or return_commit_hash or return_config_file): return config_dict - outputs = (config_dict, ) + outputs = (config_dict,) if return_unused_kwargs: - outputs += (kwargs, ) + outputs += (kwargs,) if return_commit_hash: - outputs += (commit_hash, ) + outputs += (commit_hash,) if return_config_file: - outputs += (config_file, ) + outputs += (config_file,) return outputs @@ -462,43 +447,26 @@ def extract_init_dict(cls, config_dict, **kwargs): ppdiffusers_library = importlib.import_module(__name__.split(".")[0]) if cls.has_compatibles: - compatible_classes = [ - c for c in cls._get_compatibles() - if not isinstance(c, DummyObject) - ] + compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)] else: compatible_classes = [] expected_keys_comp_cls = set() for c in compatible_classes: expected_keys_c = cls._get_init_keys(c) - expected_keys_comp_cls = expected_keys_comp_cls.union( - expected_keys_c) - expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys( - cls) - config_dict = { - k: v - for k, v in config_dict.items() if k not in expected_keys_comp_cls - } + expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c) + expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls) + config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls} # remove attributes from orig class that cannot be expected orig_cls_name = config_dict.pop("_class_name", cls.__name__) - if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library, - orig_cls_name): + if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library, orig_cls_name): orig_cls = getattr(ppdiffusers_library, orig_cls_name) - unexpected_keys_from_orig = cls._get_init_keys( - orig_cls) - expected_keys - config_dict = { - k: v - for k, v in config_dict.items() - if k not in unexpected_keys_from_orig - } + unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys + config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig} # remove private attributes - config_dict = { - k: v - for k, v in config_dict.items() if not k.startswith("_") - } + config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")} # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments init_dict = {} @@ -520,7 +488,8 @@ def extract_init_dict(cls, config_dict, **kwargs): logger.warning( f"The config attributes {config_dict} were passed to {cls.__name__}, " "but are not expected and will be ignored. Please verify your " - f"{cls.config_name} configuration file.") + f"{cls.config_name} configuration file." + ) # 5. Give nice info if config attributes are initiliazed to default because they have not been passed passed_keys = set(init_dict.keys()) @@ -530,13 +499,10 @@ def extract_init_dict(cls, config_dict, **kwargs): ) # 6. Define unused keyword arguments - unused_kwargs = { ** config_dict, ** kwargs} + unused_kwargs = {**config_dict, **kwargs} # 7. 
Define "hidden" config parameters that were saved for compatible classes - hidden_config_dict = { - k: v - for k, v in original_dict.items() if k not in init_dict - } + hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict} return init_dict, unused_kwargs, hidden_config_dict @@ -546,8 +512,7 @@ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): text = reader.read() data = json.loads(text) if "_diffusers_version" in data and "_ppdiffusers_version" not in data: - data["_ppdiffusers_version"] = data.pop("_diffusers_version", - __version__) + data["_ppdiffusers_version"] = data.pop("_diffusers_version", __version__) if "_diffusers_version" not in data and "_ppdiffusers_version" not in data: data["_ppdiffusers_version"] = __version__ @@ -581,8 +546,7 @@ def to_json_string(self, to_diffusers=False) -> str: Returns: `str`: String containing all the attributes that make up this configuration instance in JSON format. """ - config_dict = self._internal_dict if hasattr(self, - "_internal_dict") else {} + config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {} config_dict["_class_name"] = self.__class__.__name__ # json @@ -609,14 +573,12 @@ def to_json_saveable(value): config_dict.pop("_ignore_files", None) json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" if to_diffusers: - json_string = json_string.replace( - '"ppdiffusers"', '"diffusers"').replace( - '"paddlenlp.transformers"', '"transformers"') + json_string = json_string.replace('"ppdiffusers"', '"diffusers"').replace( + '"paddlenlp.transformers"', '"transformers"' + ) return json_string - def to_json_file(self, - json_file_path: Union[str, os.PathLike], - to_diffusers=False): + def to_json_file(self, json_file_path: Union[str, os.PathLike], to_diffusers=False): """ Save this instance to a JSON file. @@ -641,41 +603,39 @@ def register_to_config(init): def inner_init(self, *args, **kwargs): # Ignore private kwargs in the init. init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} - config_init_kwargs = { - k: v - for k, v in kwargs.items() if k.startswith("_") - } + config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")} if not isinstance(self, ConfigMixin): raise RuntimeError( f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does " - "not inherit from `ConfigMixin`.") + "not inherit from `ConfigMixin`." 
+ ) ignore = getattr(self, "ignore_for_config", []) # Get positional arguments aligned with kwargs new_kwargs = {} signature = inspect.signature(init) parameters = { - name: p.default - for i, (name, p) in enumerate(signature.parameters.items()) - if i > 0 and name not in ignore + name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore } for arg, name in zip(args, parameters.keys()): new_kwargs[name] = arg # Then add all kwargs - new_kwargs.update({ - k: init_kwargs.get(k, default) - for k, default in parameters.items() - if k not in ignore and k not in new_kwargs - }) - new_kwargs = { ** config_init_kwargs, ** new_kwargs} + new_kwargs.update( + { + k: init_kwargs.get(k, default) + for k, default in parameters.items() + if k not in ignore and k not in new_kwargs + } + ) + new_kwargs = {**config_init_kwargs, **new_kwargs} getattr(self, "register_to_config")(**new_kwargs) init(self, *args, **init_kwargs) return inner_init -def finfo(dtype: paddle.dtype=None): +def finfo(dtype: paddle.dtype = None): if dtype is None: dtype = paddle.get_default_dtype() @@ -699,10 +659,11 @@ class ModuleUtilsMixin: """ def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - dtype: paddle.float32=None, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + dtype: paddle.float32 = None, + ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -725,14 +686,15 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
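# Illustrative aside (not part of the diff above): a toy numpy sketch of the additive-mask
# trick the preceding comment describes. Shapes and values here are hypothetical.
import numpy as np

attention_mask = np.array([1.0, 1.0, 0.0], dtype=np.float32)       # 1.0 = attend, 0.0 = masked
additive_mask = (1.0 - attention_mask) * np.finfo(np.float32).min  # -> [0., 0., ~-3.4e38]
scores = np.array([2.0, 1.0, 3.0], dtype=np.float32) + additive_mask
weights = np.exp(scores - scores.max())
weights /= weights.sum()
# weights ~= [0.73, 0.27, 0.00]: after the softmax, the masked position contributes nothing.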
- extended_attention_mask = ( - 1.0 - extended_attention_mask) * finfo(dtype).min + extended_attention_mask = (1.0 - extended_attention_mask) * finfo(dtype).min return extended_attention_mask diff --git a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py index 81cca5941a71a..730f5b91dba6c 100644 --- a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py +++ b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py @@ -40,11 +40,12 @@ class ValueGuidedRLPipeline(DiffusionPipeline): """ def __init__( - self, - value_function: UNet1DModel, - unet: UNet1DModel, - scheduler: DDPMScheduler, - env, ): + self, + value_function: UNet1DModel, + unet: UNet1DModel, + scheduler: DDPMScheduler, + env, + ): super().__init__() self.value_function = value_function self.unet = unet @@ -89,14 +90,13 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale): y = None for i in self.progress_bar(self.scheduler.timesteps): # create batch of timesteps to pass into model - timesteps = paddle.full((batch_size, ), i, dtype=paddle.int64) + timesteps = paddle.full((batch_size,), i, dtype=paddle.int64) for _ in range(n_guide_steps): with paddle.set_grad_enabled(True): x.stop_gradient = False # permute to match dimension for pre-trained models - y = self.value_function(x.transpose([0, 2, 1]), - timesteps).sample + y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample grad = paddle.autograd.grad([y.sum()], [x])[0] posterior_variance = self.scheduler._get_variance(i) @@ -108,24 +108,17 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale): x = x + scale * grad x = self.reset_x0(x, conditions, self.action_dim) - prev_x = self.unet(x.transpose([0, 2, 1]), - timesteps).sample.transpose([0, 2, 1]) + prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1]) # TODO: verify deprecation of this kwarg - x = self.scheduler.step( - prev_x, i, x, predict_epsilon=False)["prev_sample"] + x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"] # apply conditions to the trajectory (set the initial state) x = self.reset_x0(x, conditions, self.action_dim) x = self.to_paddle(x) return x, y - def __call__(self, - obs, - batch_size=64, - planning_horizon=32, - n_guide_steps=2, - scale=0.1): + def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1): # normalize the observations and create batch dimension obs = self.normalize(obs, "observations") obs = obs[None].repeat(batch_size, axis=0) @@ -144,7 +137,7 @@ def __call__(self, # sort output trajectories by value sorted_idx = paddle.argsort(y, 0, descending=True).squeeze() sorted_values = x[sorted_idx] - actions = sorted_values[:, :, :self.action_dim] + actions = sorted_values[:, :, : self.action_dim] actions = actions.detach().cpu().numpy() denorm_actions = self.de_normalize(actions, key="actions") diff --git a/ppdiffusers/ppdiffusers/image_processor.py b/ppdiffusers/ppdiffusers/image_processor.py index 3e52c14b439c4..82f9dd5f2c682 100644 --- a/ppdiffusers/ppdiffusers/image_processor.py +++ b/ppdiffusers/ppdiffusers/image_processor.py @@ -48,12 +48,13 @@ class VaeImageProcessor(ConfigMixin): @register_to_config def __init__( - self, - do_resize: bool=True, - vae_scale_factor: int=8, - resample: str="lanczos", - do_normalize: bool=True, - do_convert_rgb: bool=False, ): + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = True, + 
do_convert_rgb: bool = False, + ): super().__init__() @staticmethod @@ -66,26 +67,20 @@ def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: images = (images * 255).round().astype("uint8") if images.shape[-1] == 1: # special case for grayscale (single channel) images - pil_images = [ - Image.fromarray( - image.squeeze(), mode="L") for image in images - ] + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] else: pil_images = [Image.fromarray(image) for image in images] return pil_images @staticmethod - def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image] - ) -> np.ndarray: + def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: """ Convert a PIL image or a list of PIL images to numpy arrays. """ if not isinstance(images, list): images = [images] - images = [ - np.array(image).astype(np.float32) / 255.0 for image in images - ] + images = [np.array(image).astype(np.float32) / 255.0 for image in images] images = np.stack(images, axis=0) return images @@ -132,10 +127,11 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: return image def resize( - self, - image: PIL.Image.Image, - height: Optional[int]=None, - width: Optional[int]=None, ) -> PIL.Image.Image: + self, + image: PIL.Image.Image, + height: Optional[int] = None, + width: Optional[int] = None, + ) -> PIL.Image.Image: """ Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor` """ @@ -144,20 +140,18 @@ def resize( if width is None: width = image.width - width, height = (x - x % self.config.vae_scale_factor - for x in (width, height) - ) # resize to integer multiple of vae_scale_factor - image = image.resize( - (width, height), resample=PIL_INTERPOLATION[self.config.resample]) + width, height = ( + x - x % self.config.vae_scale_factor for x in (width, height) + ) # resize to integer multiple of vae_scale_factor + image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image def preprocess( - self, - image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray], - height: Optional[int]=None, - width: Optional[int]=None, - do_normalize: Optional[ - bool]=None, # new added, not exists in diffusers + self, + image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray], + height: Optional[int] = None, + width: Optional[int] = None, + do_normalize: Optional[bool] = None, # new added, not exists in diffusers ) -> paddle.Tensor: """ Preprocess the image input, accepted formats are PIL images, numpy arrays or paddle tensors" @@ -165,8 +159,7 @@ def preprocess( supported_formats = (PIL.Image.Image, np.ndarray, paddle.Tensor) if isinstance(image, supported_formats): image = [image] - elif not (isinstance(image, list) and - all(isinstance(i, supported_formats) for i in image)): + elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): raise ValueError( f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" ) @@ -180,23 +173,19 @@ def preprocess( image = self.numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = self.numpy_to_pd(image) _, _, height, width = image.shape if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or - width % self.config.vae_scale_factor != 0): + height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + ): raise ValueError( f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}" f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) _, channel, height, width = image.shape # don't need any preprocess if the image is latents @@ -204,21 +193,21 @@ def preprocess( return image if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or - width % self.config.vae_scale_factor != 0): + height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + ): raise ValueError( f"Currently we only support resizing for PIL image - please resize your paddle tensor to be divisible by {self.config.vae_scale_factor}" f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) # expected range [0,1], normalize to [-1,1] - do_normalize = (self.config.do_normalize - if do_normalize is None else do_normalize) + do_normalize = self.config.do_normalize if do_normalize is None else do_normalize if image.min() < 0: warnings.warn( "Passing `image` as paddle tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " f"when passing as paddle tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, ) + FutureWarning, + ) do_normalize = False if do_normalize: @@ -227,10 +216,11 @@ def preprocess( return image def postprocess( - self, - image: paddle.Tensor, - output_type: str="pil", - do_denormalize: Optional[List[bool]]=None, ): + self, + image: paddle.Tensor, + output_type: str = "pil", + do_denormalize: Optional[List[bool]] = None, + ): if not isinstance(image, paddle.Tensor): raise ValueError( f"Input for postprocessing is in incorrect format: {type(image)}. We only support paddle tensor" @@ -238,12 +228,14 @@ def postprocess( if output_type not in ["latent", "pd", "np", "pil"]: deprecation_message = ( f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pd`, `latent`") + "`pil`, `np`, `pd`, `latent`" + ) deprecate( "Unsupported output_type", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) output_type = "np" if output_type == "latent": @@ -252,10 +244,9 @@ def postprocess( if do_denormalize is None: do_denormalize = [self.config.do_normalize] * image.shape[0] - image = paddle.stack([ - self.denormalize(image[i]) if do_denormalize[i] else image[i] - for i in range(image.shape[0]) - ]) + image = paddle.stack( + [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])] + ) if output_type == "pd": return image diff --git a/ppdiffusers/ppdiffusers/loaders.py b/ppdiffusers/ppdiffusers/loaders.py index 934518d67b9d6..da64eb0e6ec9d 100644 --- a/ppdiffusers/ppdiffusers/loaders.py +++ b/ppdiffusers/ppdiffusers/loaders.py @@ -24,16 +24,31 @@ from huggingface_hub import hf_hub_download from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status -from .models.attention_processor import (CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, - LoRAAttnProcessor) +from .models.attention_processor import ( + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + LoRAAttnProcessor, +) from .models.modeling_utils import convert_state_dict -from .utils import (DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, - HF_HUB_OFFLINE, PPDIFFUSERS_CACHE, TEXT_ENCODER_ATTN_MODULE, - TO_DIFFUSERS, _get_model_file, is_paddlenlp_available, - is_safetensors_available, is_torch_available, is_torch_file, - logging, ppdiffusers_url_download, safetensors_load, - smart_load, torch_load) +from .utils import ( + DIFFUSERS_CACHE, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + PPDIFFUSERS_CACHE, + TEXT_ENCODER_ATTN_MODULE, + TO_DIFFUSERS, + _get_model_file, + is_paddlenlp_available, + is_safetensors_available, + is_torch_available, + is_torch_file, + logging, + ppdiffusers_url_download, + safetensors_load, + smart_load, + torch_load, +) logger = logging.get_logger(__name__) @@ -68,11 +83,9 @@ def transpose_state_dict(state_dict, name_mapping=None): for old_name, new_name in name_mapping.items(): k = k.replace(old_name, new_name) if v.ndim == 2: - new_state_dict[k] = v.T.contiguous() if hasattr( - v, "contiguous") else v.T + new_state_dict[k] = v.T.contiguous() if hasattr(v, "contiguous") else v.T else: - new_state_dict[k] = v.contiguous() if hasattr(v, - "contiguous") else v + new_state_dict[k] = v.contiguous() if hasattr(v, "contiguous") else v return new_state_dict @@ -110,8 +123,7 @@ def map_from(module, state_dict, *args, **kwargs): all_keys = list(state_dict.keys()) for key in all_keys: replace_key = remap_key(key, state_dict) - new_key = key.replace( - replace_key, f"layers.{module.rev_mapping[replace_key]}") + new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}") state_dict[new_key] = state_dict[key] del state_dict[key] @@ -124,10 +136,10 @@ class UNet2DConditionLoadersMixin: unet_name = UNET_NAME def load_attn_procs( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[ - str, paddle.Tensor]], - **kwargs, ): + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], + **kwargs, + ): r""" Load pretrained attention processor layers into `UNet2DConditionModel`. 
Attention processor layers have to be defined in @@ -186,8 +198,9 @@ def load_attn_procs( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) @@ -202,8 +215,7 @@ def load_attn_procs( # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning network_alpha = kwargs.pop("network_alpha", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" ) @@ -221,13 +233,12 @@ def load_attn_procs( if from_diffusers: # Let's first try to load .safetensors weights if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path_or_dict, - weights_name=weight_name or - TORCH_LORA_WEIGHT_NAME_SAFE, + weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -237,7 +248,8 @@ def load_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) except Exception: model_file = None @@ -255,7 +267,8 @@ def load_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: model_file = _get_model_file( @@ -270,7 +283,8 @@ def load_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: state_dict = pretrained_model_name_or_path_or_dict @@ -279,53 +293,42 @@ def load_attn_procs( attn_processors = {} is_lora = all("lora" in k for k in state_dict.keys()) - is_custom_diffusion = any("custom_diffusion" in k - for k in state_dict.keys()) + is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys()) if from_diffusers or is_torch_file(model_file): state_dict = transpose_state_dict(state_dict) if is_lora: is_new_lora_format = all( - key.startswith(self.unet_name) or - key.startswith(self.text_encoder_name) - for key in state_dict.keys()) + key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() + ) if is_new_lora_format: # Strip the `"unet"` prefix. - is_text_encoder_present = any( - key.startswith(self.text_encoder_name) - for key in state_dict.keys()) + is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys()) if is_text_encoder_present: warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. 
To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)." warnings.warn(warn_message) - unet_keys = [ - k for k in state_dict.keys() if k.startswith(self.unet_name) - ] - state_dict = { - k.replace(f"{self.unet_name}.", ""): v - for k, v in state_dict.items() if k in unet_keys - } + unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)] + state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys} lora_grouped_dict = defaultdict(dict) for key, value in state_dict.items(): - attn_processor_key, sub_key = ".".join(key.split( - ".")[:-3]), ".".join(key.split(".")[-3:]) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) lora_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32") # we must cast this to float32 + dtype="float32" + ) # we must cast this to float32 for key, value_dict in lora_grouped_dict.items(): - rank = value_dict["to_k_lora.down.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear - cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[ - 0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_lora.up.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear + rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear + hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear attn_processors[key] = LoRAAttnProcessor( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank, - network_alpha=network_alpha, ) + network_alpha=network_alpha, + ) attn_processors[key].load_dict(value_dict) elif is_custom_diffusion: custom_diffusion_grouped_dict = defaultdict(dict) @@ -334,16 +337,12 @@ def load_attn_procs( custom_diffusion_grouped_dict[key] = {} else: if "to_out" in key: - attn_processor_key, sub_key = ".".join( - key.split(".")[:-3]), ".".join( - key.split(".")[-3:]) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) else: - attn_processor_key, sub_key = ".".join( - key.split(".")[:-2]), ".".join( - key.split(".")[-2:]) - custom_diffusion_grouped_dict[attn_processor_key][ - sub_key] = value.cast( - dtype="float32") # we must cast this to float32 + attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:]) + custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value.cast( + dtype="float32" + ) # we must cast this to float32 for key, value_dict in custom_diffusion_grouped_dict.items(): if len(value_dict) == 0: @@ -351,44 +350,42 @@ def load_attn_procs( train_kv=False, train_q_out=False, hidden_size=None, - cross_attention_dim=None, ) + cross_attention_dim=None, + ) else: - cross_attention_dim = value_dict[ - "to_k_custom_diffusion.weight"].shape[ - 0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict[ - "to_k_custom_diffusion.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear - train_q_out = (True if - "to_q_custom_diffusion.weight" in value_dict - else False) + cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[ + 0 + ] # 1 -> 0, torch vs paddle nn.Linear + hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[ + 1 + ] # 0 -> 1, torch vs paddle nn.Linear + train_q_out = True if 
"to_q_custom_diffusion.weight" in value_dict else False attn_processors[key] = CustomDiffusionAttnProcessor( train_kv=True, train_q_out=train_q_out, hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, ) + cross_attention_dim=cross_attention_dim, + ) attn_processors[key].load_dict(value_dict) else: raise ValueError( f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." ) # set correct dtype & device - attn_processors = { - k: v.to(dtype=self.dtype) - for k, v in attn_processors.items() - } + attn_processors = {k: v.to(dtype=self.dtype) for k, v in attn_processors.items()} # set layers self.set_attn_processor(attn_processors) def save_attn_procs( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool=True, - weight_name: str=None, - save_function: Callable=None, - safe_serialization: bool=False, - to_diffusers: Optional[bool]=None, ): + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = False, + to_diffusers: Optional[bool] = None, + ): r""" Save an attention processor to a directory, so that it can be re-loaded using the `[`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`]` method. @@ -413,34 +410,33 @@ def save_attn_procs( """ if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." - ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) is_custom_diffusion = any( - isinstance(x, (CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor)) - for (_, x) in self.attn_processors.items()) + isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) + for (_, x) in self.attn_processors.items() + ) if is_custom_diffusion: - model_to_save = AttnProcsLayers({ - y: x - for (y, x) in self.attn_processors.items() - if isinstance( - x, - ( - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, ), ) - }) + model_to_save = AttnProcsLayers( + { + y: x + for (y, x) in self.attn_processors.items() + if isinstance( + x, + ( + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + ), + ) + } + ) state_dict = model_to_save.state_dict() for name, attn in self.attn_processors.items(): if len(attn.state_dict()) == 0: @@ -452,16 +448,13 @@ def save_attn_procs( if weight_name is None: if to_diffusers: if safe_serialization: - weight_name = (TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE - if is_custom_diffusion else - TORCH_LORA_WEIGHT_NAME_SAFE) + weight_name = ( + TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME_SAFE + ) else: - weight_name = (TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME - if is_custom_diffusion else - TORCH_LORA_WEIGHT_NAME) + weight_name = TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME else: - weight_name = (PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if - is_custom_diffusion else 
PADDLE_LORA_WEIGHT_NAME) + weight_name = PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else PADDLE_LORA_WEIGHT_NAME # choose save_function if save_function is None: @@ -469,16 +462,13 @@ def save_attn_procs( if safe_serialization: if is_torch_available(): _save_function = safetensors.torch.save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: _save_function = safetensors.numpy.save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") + state_dict = convert_state_dict(state_dict, framework="numpy") def save_function(weights, filename): - return _save_function( - weights, filename, metadata={"format": "pt"}) + return _save_function(weights, filename, metadata={"format": "pt"}) else: if not is_torch_available(): @@ -486,8 +476,7 @@ def save_function(weights, filename): "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." ) save_function = torch.save - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") state_dict = transpose_state_dict(state_dict) else: save_function = paddle.save @@ -495,9 +484,7 @@ def save_function(weights, filename): # Save the model save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weight_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") class TextualInversionLoaderMixin: @@ -505,9 +492,7 @@ class TextualInversionLoaderMixin: Mixin class for loading textual inversion tokens and embeddings to the tokenizer and text encoder. """ - def maybe_convert_prompt(self, - prompt: Union[str, List[str]], - tokenizer: "PretrainedTokenizer"): + def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PretrainedTokenizer"): r""" Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds to a multi-vector textual inversion embedding, this function will process the prompt so that the special token @@ -533,9 +518,7 @@ def maybe_convert_prompt(self, return prompts - def _maybe_convert_prompt(self, - prompt: str, - tokenizer: "PretrainedTokenizer"): + def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"): r""" Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds to a multi-vector textual inversion embedding, this function will process the prompt so that the special token @@ -563,10 +546,11 @@ def _maybe_convert_prompt(self, return prompt def load_textual_inversion( - self, - pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]], - token: Optional[str]=None, - **kwargs, ): + self, + pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]], + token: Optional[str] = None, + **kwargs, + ): r""" Load textual inversion embeddings into the text encoder of stable diffusion pipelines. Both `diffusers` and `Automatic1111` formats are supported (see example below). 
@@ -643,20 +627,21 @@ def load_textual_inversion( image.save("character.png") ``` """ - if not hasattr(self, "tokenizer") or not isinstance( - self.tokenizer, PretrainedTokenizer): + if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PretrainedTokenizer): raise ValueError( f"{self.__class__.__name__} requires `self.tokenizer` of type `PretrainedTokenizer` for calling" - f" `{self.load_textual_inversion.__name__}`") + f" `{self.load_textual_inversion.__name__}`" + ) - if not hasattr(self, "text_encoder") or not isinstance( - self.text_encoder, PretrainedModel): + if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder, PretrainedModel): raise ValueError( f"{self.__class__.__name__} requires `self.text_encoder` of type `PretrainedModel` for calling" - f" `{self.load_textual_inversion.__name__}`") + f" `{self.load_textual_inversion.__name__}`" + ) from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -668,8 +653,7 @@ def load_textual_inversion( weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" ) @@ -685,13 +669,12 @@ def load_textual_inversion( # Let's first try to load .safetensors weights if from_diffusers: if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=weight_name or - TORCH_TEXT_INVERSION_NAME_SAFE, + weights_name=weight_name or TORCH_TEXT_INVERSION_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -701,7 +684,8 @@ def load_textual_inversion( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = safetensors_load(model_file) except Exception: model_file = None @@ -719,7 +703,8 @@ def load_textual_inversion( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = torch_load(model_file) else: model_file = _get_model_file( @@ -734,7 +719,8 @@ def load_textual_inversion( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) if is_torch_file(model_file): try: state_dict = safetensors_load(model_file) @@ -759,9 +745,7 @@ def load_textual_inversion( embedding = state_dict["string_to_param"]["*"] if token is not None and loaded_token != token: - logger.warn( - f"The loaded token: {loaded_token} is overwritten by the passed token {token}." 
- ) + logger.warn(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.") else: token = loaded_token @@ -795,14 +779,11 @@ def load_textual_inversion( is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1 if is_multi_vector: - tokens = [token] + [ - f"{token}_{i}" for i in range(1, embedding.shape[0]) - ] + tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])] embeddings = [e for e in embedding] # noqa: C416 else: tokens = [token] - embeddings = [embedding[0]] if len( - embedding.shape) > 1 else [embedding] + embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding] # add tokens and get ids self.tokenizer.add_tokens(tokens) @@ -812,8 +793,7 @@ def load_textual_inversion( self.text_encoder.resize_token_embeddings(len(self.tokenizer)) with paddle.no_grad(): for token_id, embedding in zip(token_ids, embeddings): - self.text_encoder.get_input_embeddings().weight[ - token_id] = embedding + self.text_encoder.get_input_embeddings().weight[token_id] = embedding logger.info(f"Loaded textual inversion embedding for {token}.") @@ -830,10 +810,10 @@ class LoraLoaderMixin: unet_name = UNET_NAME def load_lora_weights( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[ - str, paddle.Tensor]], - **kwargs, ): + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], + **kwargs, + ): r""" Load pretrained attention processor layers (such as LoRA) into [`UNet2DConditionModel`] and [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)). @@ -885,8 +865,9 @@ def load_lora_weights( # Load the main state dict first which has the LoRA layers for either of # UNet and text encoder or both. from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -901,8 +882,7 @@ def load_lora_weights( # set lora scale to a reasonable default self._lora_scale = 1.0 - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetenstors" ) @@ -920,13 +900,12 @@ def load_lora_weights( if from_diffusers: # Let's first try to load .safetensors weights if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path_or_dict, - weights_name=weight_name or - TORCH_LORA_WEIGHT_NAME_SAFE, + weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -936,7 +915,8 @@ def load_lora_weights( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) except Exception: model_file = None @@ -954,7 +934,8 @@ def load_lora_weights( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: model_file = _get_model_file( @@ -969,7 +950,8 @@ def load_lora_weights( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: state_dict = pretrained_model_name_or_path_or_dict @@ -979,45 +961,39 @@ def load_lora_weights( # Convert kohya-ss Style LoRA attn procs to ppdiffusers attn procs network_alpha = None - if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) - for k in state_dict.keys()): - state_dict, network_alpha = self._convert_kohya_lora_to_diffusers( - state_dict) + if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()): + state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(state_dict) from_diffusers = True # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as # their prefixes. keys = list(state_dict.keys()) - if all( - key.startswith(self.unet_name) or - key.startswith(self.text_encoder_name) for key in keys): + if all(key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in keys): # Load the layers corresponding to UNet. unet_keys = [k for k in keys if k.startswith(self.unet_name)] logger.info(f"Loading {self.unet_name}.") unet_lora_state_dict = { - k.replace(f"{self.unet_name}.", ""): v - for k, v in state_dict.items() if k in unet_keys + k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys } self.unet.load_attn_procs( unet_lora_state_dict, network_alpha=network_alpha, - from_diffusers=from_diffusers, ) + from_diffusers=from_diffusers, + ) # Load the layers corresponding to text encoder and make necessary adjustments. 
- text_encoder_keys = [ - k for k in keys if k.startswith(self.text_encoder_name) - ] + text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)] text_encoder_lora_state_dict = { - k.replace(f"{self.text_encoder_name}.", ""): v - for k, v in state_dict.items() if k in text_encoder_keys + k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys } if len(text_encoder_lora_state_dict) > 0: logger.info(f"Loading {self.text_encoder_name}.") attn_procs_text_encoder = self._load_text_encoder_attn_procs( text_encoder_lora_state_dict, network_alpha=network_alpha, - from_diffusers=from_diffusers, ) + from_diffusers=from_diffusers, + ) self._modify_text_encoder(attn_procs_text_encoder) # save lora attn procs of text encoder so that it can be easily retrieved @@ -1026,13 +1002,9 @@ def load_lora_weights( # Otherwise, we're dealing with the old format. This means the `state_dict` should only # contain the module names of the `unet` as its keys WITHOUT any prefix. elif not all( - key.startswith(self.unet_name) or - key.startswith(self.text_encoder_name) - for key in state_dict.keys()): - self.unet.load_attn_procs( - state_dict, - network_alpha=network_alpha, - from_diffusers=from_diffusers) + key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() + ): + self.unet.load_attn_procs(state_dict, network_alpha=network_alpha, from_diffusers=from_diffusers) warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`." warnings.warn(warn_message) @@ -1050,15 +1022,13 @@ def text_encoder_lora_attn_procs(self): def _remove_text_encoder_monkey_patch(self): # Loop over the nn.MultiHeadAttention module of text_encoder - for name, attn_module in self.text_encoder.named_sublayers( - include_self=True): + for name, attn_module in self.text_encoder.named_sublayers(include_self=True): if name.endswith(TEXT_ENCODER_ATTN_MODULE): # Loop over the LoRA layers for ( - _, - text_encoder_attr, - ) in self._lora_attn_processor_attr_to_text_encoder_attr.items( - ): + _, + text_encoder_attr, + ) in self._lora_attn_processor_attr_to_text_encoder_attr.items(): # Retrieve the q/k/v/out projection of nn.MultiHeadAttention module = attn_module.get_sublayer(text_encoder_attr) if hasattr(module, "old_forward"): @@ -1071,8 +1041,7 @@ def _remove_text_encoder_monkey_patch(self): # del processor delattr(attn_module, "processor") - def _modify_text_encoder(self, - attn_processors: Dict[str, LoRAAttnProcessor]): + def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): r""" Monkey-patches the forward passes of attention modules of the text encoder. 
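# Illustrative aside (not part of the diff above): a minimal, standalone sketch of the
# monkey-patching pattern this method applies (see the hunk that follows). `base_linear`
# and `lora_delta` are hypothetical stand-ins for an attention projection and its LoRA layer.
import paddle
import paddle.nn as nn

def make_new_forward(old_forward, lora_layer, lora_scale=1.0):
    # A factory is used so each patched module captures its own old_forward/lora_layer;
    # a bare closure defined inside the loop would late-bind to the last loop variables.
    def new_forward(x):
        return old_forward(x) + lora_scale * lora_layer(x)
    return new_forward

base_linear = nn.Linear(8, 8)
lora_delta = nn.Linear(8, 8, bias_attr=False)
base_linear.old_forward = base_linear.forward  # keep a handle so the patch can be undone later
base_linear.forward = make_new_forward(base_linear.old_forward, lora_delta)
y = base_linear(paddle.randn([2, 8]))          # base projection output plus the scaled LoRA delta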
@@ -1085,19 +1054,16 @@ def _modify_text_encoder(self, self._remove_text_encoder_monkey_patch() # Loop over the nn.MultiHeadAttention module of text_encoder - for name, attn_module in self.text_encoder.named_sublayers( - include_self=True): + for name, attn_module in self.text_encoder.named_sublayers(include_self=True): if name.endswith(TEXT_ENCODER_ATTN_MODULE): # Loop over the LoRA layers for ( - attn_proc_attr, - text_encoder_attr, - ) in self._lora_attn_processor_attr_to_text_encoder_attr.items( - ): + attn_proc_attr, + text_encoder_attr, + ) in self._lora_attn_processor_attr_to_text_encoder_attr.items(): # Retrieve the q/k/v/out projection of nn.MultiHeadAttention and its corresponding LoRA layer. module = attn_module.get_sublayer(text_encoder_attr) - lora_layer = attn_processors[name].get_sublayer( - attn_proc_attr) + lora_layer = attn_processors[name].get_sublayer(attn_proc_attr) # save old_forward to module that can be used to remove monkey-patch old_forward = module.old_forward = module.forward @@ -1105,8 +1071,7 @@ def _modify_text_encoder(self, # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 def make_new_forward(old_forward, lora_layer): def new_forward(x): - result = old_forward( - x) + self.lora_scale * lora_layer(x) + result = old_forward(x) + self.lora_scale * lora_layer(x) return result return new_forward @@ -1127,10 +1092,10 @@ def _lora_attn_processor_attr_to_text_encoder_attr(self): } def _load_text_encoder_attn_procs( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[ - str, paddle.Tensor]], - **kwargs, ): + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], + **kwargs, + ): r""" Load pretrained attention processor layers for [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). @@ -1184,8 +1149,9 @@ def _load_text_encoder_attn_procs( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -1198,8 +1164,7 @@ def _load_text_encoder_attn_procs( use_safetensors = kwargs.pop("use_safetensors", None) network_alpha = kwargs.pop("network_alpha", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetenstors" ) @@ -1215,13 +1180,12 @@ def _load_text_encoder_attn_procs( if from_diffusers: # Let's first try to load .safetensors weights if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path_or_dict, - weights_name=weight_name or - TORCH_LORA_WEIGHT_NAME_SAFE, + weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -1231,7 +1195,8 @@ def _load_text_encoder_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) except Exception: model_file = None @@ -1249,7 +1214,8 @@ def _load_text_encoder_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: model_file = _get_model_file( @@ -1264,7 +1230,8 @@ def _load_text_encoder_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: state_dict = pretrained_model_name_or_path_or_dict @@ -1275,55 +1242,48 @@ def _load_text_encoder_attn_procs( is_lora = all("lora" in k for k in state_dict.keys()) if from_diffusers or is_torch_file(model_file): - state_dict = transpose_state_dict( - state_dict, name_mapping={".encoder.": ".transformer."}) + state_dict = transpose_state_dict(state_dict, name_mapping={".encoder.": ".transformer."}) if is_lora: lora_grouped_dict = defaultdict(dict) for key, value in state_dict.items(): - attn_processor_key, sub_key = ".".join(key.split( - ".")[:-3]), ".".join(key.split(".")[-3:]) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) lora_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32") # we must cast this to float32 + dtype="float32" + ) # we must cast this to float32 for key, value_dict in lora_grouped_dict.items(): - rank = value_dict["to_k_lora.down.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear - cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[ - 0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_lora.up.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear + rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear + hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear attn_processors[key] = LoRAAttnProcessor( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank, - network_alpha=network_alpha, ) + network_alpha=network_alpha, + ) attn_processors[key].load_dict(value_dict) else: - raise ValueError( - f"{model_file} does not seem to be in the correct format expected by LoRA training." 
- ) + raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.") # set correct dtype & device - attn_processors = { - k: v.to(dtype=self.text_encoder.dtype) - for k, v in attn_processors.items() - } + attn_processors = {k: v.to(dtype=self.text_encoder.dtype) for k, v in attn_processors.items()} return attn_processors @classmethod def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, nn.Layer]=None, - text_encoder_lora_layers: Dict[str, nn.Layer]=None, - is_main_process: bool=True, - weight_name: str=None, - save_function: Callable=None, - safe_serialization: bool=False, - to_diffusers: Optional[bool]=None, ): + self, + save_directory: Union[str, os.PathLike], + unet_lora_layers: Dict[str, nn.Layer] = None, + text_encoder_lora_layers: Dict[str, nn.Layer] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = False, + to_diffusers: Optional[bool] = None, + ): r""" Save the LoRA parameters corresponding to the UNet and the text encoder. Arguments: @@ -1347,16 +1307,11 @@ def save_lora_weights( """ if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." - ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) @@ -1372,8 +1327,7 @@ def save_lora_weights( if text_encoder_lora_layers is not None: text_encoder_lora_state_dict = { f"{self.text_encoder_name}.{module_name}": param - for module_name, param in text_encoder_lora_layers.state_dict() - .items() + for module_name, param in text_encoder_lora_layers.state_dict().items() } state_dict.update(text_encoder_lora_state_dict) # TODO junnyu, rename paramaters. @@ -1394,16 +1348,13 @@ def save_lora_weights( if safe_serialization: if is_torch_available(): _save_function = safetensors.torch.save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: _save_function = safetensors.numpy.save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") + state_dict = convert_state_dict(state_dict, framework="numpy") def save_function(weights, filename): - return _save_function( - weights, filename, metadata={"format": "pt"}) + return _save_function(weights, filename, metadata={"format": "pt"}) else: if not is_torch_available(): @@ -1411,17 +1362,13 @@ def save_function(weights, filename): "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." 
) save_function = torch.save - state_dict = convert_state_dict( - state_dict, framework="torch") - state_dict = transpose_state_dict( - state_dict, name_mapping={".transformer.": ".encoder."}) + state_dict = convert_state_dict(state_dict, framework="torch") + state_dict = transpose_state_dict(state_dict, name_mapping={".transformer.": ".encoder."}) else: save_function = paddle.save save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weight_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") def _convert_kohya_lora_to_diffusers(self, state_dict): unet_state_dict = {} @@ -1442,62 +1389,36 @@ def _convert_kohya_lora_to_diffusers(self, state_dict): raise ValueError("Network alpha is not consistent") if lora_name.startswith("lora_unet_"): - diffusers_name = key.replace("lora_unet_", "").replace("_", - ".") - diffusers_name = diffusers_name.replace("down.blocks", - "down_blocks") - diffusers_name = diffusers_name.replace("mid.block", - "mid_block") - diffusers_name = diffusers_name.replace("up.blocks", - "up_blocks") - diffusers_name = diffusers_name.replace( - "transformer.blocks", "transformer_blocks") - diffusers_name = diffusers_name.replace("to.q.lora", - "to_q_lora") - diffusers_name = diffusers_name.replace("to.k.lora", - "to_k_lora") - diffusers_name = diffusers_name.replace("to.v.lora", - "to_v_lora") - diffusers_name = diffusers_name.replace("to.out.0.lora", - "to_out_lora") + diffusers_name = key.replace("lora_unet_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("down.blocks", "down_blocks") + diffusers_name = diffusers_name.replace("mid.block", "mid_block") + diffusers_name = diffusers_name.replace("up.blocks", "up_blocks") + diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks") + diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora") if "transformer_blocks" in diffusers_name: if "attn1" in diffusers_name or "attn2" in diffusers_name: - diffusers_name = diffusers_name.replace( - "attn1", "attn1.processor") - diffusers_name = diffusers_name.replace( - "attn2", "attn2.processor") + diffusers_name = diffusers_name.replace("attn1", "attn1.processor") + diffusers_name = diffusers_name.replace("attn2", "attn2.processor") unet_state_dict[diffusers_name] = value - unet_state_dict[diffusers_name.replace( - ".down.", ".up.")] = state_dict[lora_name_up] + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] elif lora_name.startswith("lora_te_"): - diffusers_name = key.replace("lora_te_", "").replace("_", - ".") - diffusers_name = diffusers_name.replace("text.model", - "text_model") - diffusers_name = diffusers_name.replace("self.attn", - "self_attn") - diffusers_name = diffusers_name.replace("q.proj.lora", - "to_q_lora") - diffusers_name = diffusers_name.replace("k.proj.lora", - "to_k_lora") - diffusers_name = diffusers_name.replace("v.proj.lora", - "to_v_lora") - diffusers_name = diffusers_name.replace("out.proj.lora", - "to_out_lora") + diffusers_name = key.replace("lora_te_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("text.model", "text_model") + diffusers_name = diffusers_name.replace("self.attn", "self_attn") + diffusers_name = 
diffusers_name.replace("q.proj.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") if "self_attn" in diffusers_name: te_state_dict[diffusers_name] = value - te_state_dict[diffusers_name.replace( - ".down.", ".up.")] = state_dict[lora_name_up] + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] - unet_state_dict = { - f"{UNET_NAME}.{module_name}": params - for module_name, params in unet_state_dict.items() - } - te_state_dict = { - f"{TEXT_ENCODER_NAME}.{module_name}": params - for module_name, params in te_state_dict.items() - } - new_state_dict = { ** unet_state_dict, ** te_state_dict} + unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()} + te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()} + new_state_dict = {**unet_state_dict, **te_state_dict} return new_state_dict, network_alpha @@ -1582,12 +1503,14 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): ``` """ # import here to avoid circular dependency - from .pipelines.stable_diffusion.convert_from_ckpt import \ - download_from_original_stable_diffusion_ckpt + from .pipelines.stable_diffusion.convert_from_ckpt import ( + download_from_original_stable_diffusion_ckpt, + ) from_hf_hub = "huggingface.co" in pretrained_model_link_or_path or "hf.co" - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) @@ -1631,22 +1554,20 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): pretrained_model_link_or_path = str(pretrained_model_link_or_path) if os.path.isfile(pretrained_model_link_or_path): checkpoint_path = pretrained_model_link_or_path - elif pretrained_model_link_or_path.startswith( - "http://") or pretrained_model_link_or_path.startswith( - "https://"): + elif pretrained_model_link_or_path.startswith("http://") or pretrained_model_link_or_path.startswith( + "https://" + ): # HF Hub models - if any(p in pretrained_model_link_or_path - for p in ["huggingface.co", "hf.co"]): + if any(p in pretrained_model_link_or_path for p in ["huggingface.co", "hf.co"]): # remove huggingface url for prefix in [ - "https://huggingface.co/", - "huggingface.co/", - "hf.co/", - "https://hf.co/", + "https://huggingface.co/", + "huggingface.co/", + "hf.co/", + "https://hf.co/", ]: if pretrained_model_link_or_path.startswith(prefix): - pretrained_model_link_or_path = pretrained_model_link_or_path[ - len(prefix):] + pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :] # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained ckpt_path = Path(pretrained_model_link_or_path) @@ -1656,10 +1577,10 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): file_path = str(Path().joinpath(*ckpt_path.parts[2:])) if file_path.startswith("blob/"): - file_path = file_path[len("blob/"):] + file_path = file_path[len("blob/") :] if file_path.startswith("main/"): - file_path = file_path[len("main/"):] + file_path = file_path[len("main/") :] 
checkpoint_path = hf_hub_download( repo_id, @@ -1670,17 +1591,18 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, - force_download=force_download, ) + force_download=force_download, + ) else: checkpoint_path = ckpt_path else: checkpoint_path = ppdiffusers_url_download( pretrained_model_link_or_path, cache_dir=cache_dir, - filename=http_file_name(pretrained_model_link_or_path) - .strip('"'), + filename=http_file_name(pretrained_model_link_or_path).strip('"'), force_download=force_download, - resume_download=resume_download, ) + resume_download=resume_download, + ) else: checkpoint_path = pretrained_model_link_or_path @@ -1697,18 +1619,20 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): upcast_attention=upcast_attention, load_safety_checker=load_safety_checker, prediction_type=prediction_type, - paddle_dtype=paddle_dtype, ) + paddle_dtype=paddle_dtype, + ) return pipe def http_file_name( - url: str, - *, - proxies=None, - headers: Optional[Dict[str, str]]=None, - timeout=10.0, - max_retries=0, ): + url: str, + *, + proxies=None, + headers: Optional[Dict[str, str]] = None, + timeout=10.0, + max_retries=0, +): """ Get a remote file name. """ @@ -1720,7 +1644,8 @@ def http_file_name( proxies=proxies, headers=headers, timeout=timeout, - max_retries=max_retries, ) + max_retries=max_retries, + ) hf_raise_for_status(r) displayed_name = url.split("/")[-1] content_disposition = r.headers.get("Content-Disposition") diff --git a/ppdiffusers/ppdiffusers/models/__init__.py b/ppdiffusers/ppdiffusers/models/__init__.py index 3269f70a0217e..19d5a1b254b83 100644 --- a/ppdiffusers/ppdiffusers/models/__init__.py +++ b/ppdiffusers/ppdiffusers/models/__init__.py @@ -14,8 +14,11 @@ # limitations under the License. 
# flake8: noqa -from ..utils.import_utils import (OptionalDependencyNotAvailable, - is_einops_available, is_paddle_available) +from ..utils.import_utils import ( + OptionalDependencyNotAvailable, + is_einops_available, + is_paddle_available, +) if is_paddle_available(): from .adapter import MultiAdapter, T2IAdapter diff --git a/ppdiffusers/ppdiffusers/models/adapter.py b/ppdiffusers/ppdiffusers/models/adapter.py index f51292032a59c..639118f29b348 100644 --- a/ppdiffusers/ppdiffusers/models/adapter.py +++ b/ppdiffusers/ppdiffusers/models/adapter.py @@ -22,15 +22,7 @@ class BottleneckResnetBlock(paddle.nn.Layer): - def __init__(self, - in_c, - mid_c, - out_c, - down, - ksize=3, - sk=False, - use_conv=True, - proj_ksize=1): + def __init__(self, in_c, mid_c, out_c, down, ksize=3, sk=False, use_conv=True, proj_ksize=1): super().__init__() ps = ksize // 2 proj_pad = proj_ksize // 2 @@ -40,7 +32,8 @@ def __init__(self, out_channels=mid_c, kernel_size=proj_ksize, stride=1, - padding=proj_pad, ) + padding=proj_pad, + ) else: self.conv1 = None if out_c != mid_c: @@ -49,29 +42,27 @@ def __init__(self, out_channels=out_c, kernel_size=proj_ksize, stride=1, - padding=proj_pad, ) + padding=proj_pad, + ) else: self.conv2 = None - self.block1 = paddle.nn.Conv2D( - in_channels=mid_c, - out_channels=mid_c, - kernel_size=3, - stride=1, - padding=1) + self.block1 = paddle.nn.Conv2D(in_channels=mid_c, out_channels=mid_c, kernel_size=3, stride=1, padding=1) self.act = paddle.nn.ReLU() self.block2 = paddle.nn.Conv2D( in_channels=mid_c, out_channels=mid_c, kernel_size=ksize, stride=1, - padding=ps, ) + padding=ps, + ) if sk is False: self.conv_shortcut = paddle.nn.Conv2D( in_channels=in_c, out_channels=mid_c, kernel_size=ksize, stride=1, - padding=ps, ) + padding=ps, + ) else: self.conv_shortcut = None self.down = down @@ -136,20 +127,20 @@ class T2IAdapter(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - block_out_channels: List[int]=[320, 640, 1280, 1280], - block_mid_channels: Optional[List[int]]=None, - num_res_blocks: int=3, - channels_in: int=3, - kernel_size: int=3, - proj_kernel_size: int=1, - res_block_skip: bool=True, - use_conv: bool=False, - input_scale_factor: int=8, ): + self, + block_out_channels: List[int] = [320, 640, 1280, 1280], + block_mid_channels: Optional[List[int]] = None, + num_res_blocks: int = 3, + channels_in: int = 3, + kernel_size: int = 3, + proj_kernel_size: int = 1, + res_block_skip: bool = True, + use_conv: bool = False, + input_scale_factor: int = 8, + ): super(T2IAdapter, self).__init__() self.num_downsample_blocks = len(block_out_channels) - self.unshuffle = paddle.nn.PixelUnshuffle( - downscale_factor=input_scale_factor) + self.unshuffle = paddle.nn.PixelUnshuffle(downscale_factor=input_scale_factor) self.num_res_blocks = num_res_blocks self.body = [] if block_mid_channels is None: @@ -166,7 +157,9 @@ def __init__( ksize=kernel_size, proj_ksize=proj_kernel_size, sk=res_block_skip, - use_conv=use_conv, )) + use_conv=use_conv, + ) + ) elif j == num_res_blocks - 1: self.body.append( BottleneckResnetBlock( @@ -177,7 +170,9 @@ def __init__( ksize=kernel_size, proj_ksize=proj_kernel_size, sk=res_block_skip, - use_conv=use_conv, )) + use_conv=use_conv, + ) + ) else: self.body.append( BottleneckResnetBlock( @@ -188,7 +183,9 @@ def __init__( ksize=kernel_size, proj_ksize=proj_kernel_size, sk=res_block_skip, - use_conv=use_conv, )) + use_conv=use_conv, + ) + ) self.body = paddle.nn.LayerList(sublayers=self.body) if block_mid_channels[0] == block_out_channels[0]: 
self.conv_in = paddle.nn.Conv2D( @@ -196,14 +193,16 @@ def __init__( out_channels=block_mid_channels[0], kernel_size=3, stride=1, - padding=1, ) + padding=1, + ) else: self.conv_in = paddle.nn.Conv2D( in_channels=channels_in * input_scale_factor**2, out_channels=block_mid_channels[0], kernel_size=proj_kernel_size, stride=1, - padding=proj_kernel_size // 2, ) + padding=proj_kernel_size // 2, + ) def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]: """ @@ -241,9 +240,7 @@ def __init__(self, adapters: List[T2IAdapter]): self.num_adapter = len(adapters) self.adapters = paddle.nn.LayerList(sublayers=adapters) - def forward( - self, xs: paddle.Tensor, - adapter_weights: Optional[List[float]]=None) -> List[paddle.Tensor]: + def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = None) -> List[paddle.Tensor]: """ Args: xs (`torch.Tensor`): @@ -254,8 +251,7 @@ def forward( them together. """ if adapter_weights is None: - adapter_weights = paddle.to_tensor([1 / self.num_adapter] * - self.num_adapter) + adapter_weights = paddle.to_tensor([1 / self.num_adapter] * self.num_adapter) else: adapter_weights = paddle.to_tensor(adapter_weights) if xs.shape[1] % self.num_adapter != 0: diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py index 47ae9ef9aa303..199e115810a3e 100644 --- a/ppdiffusers/ppdiffusers/models/attention.py +++ b/ppdiffusers/ppdiffusers/models/attention.py @@ -24,7 +24,7 @@ from .embeddings import CombinedTimestepLabelEmbeddings -def drop_path(input, drop_prob: float=0.0, training: bool=False): +def drop_path(input, drop_prob: float = 0.0, training: bool = False): """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -37,8 +37,7 @@ def drop_path(input, drop_prob: float=0.0, training: bool=False): if drop_prob == 0.0 or not training: return input keep_prob = 1 - drop_prob - shape = (input.shape[0], ) + (1, ) * ( - input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = (input / keep_prob) * random_tensor @@ -48,7 +47,7 @@ def drop_path(input, drop_prob: float=0.0, training: bool=False): class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - def __init__(self, drop_prob: Optional[float]=None) -> None: + def __init__(self, drop_prob: Optional[float] = None) -> None: super().__init__() self.drop_prob = drop_prob @@ -61,12 +60,13 @@ def extra_repr(self) -> str: class Mlp(nn.Layer): def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, ): + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -103,22 +103,21 @@ class AttentionBlock(nn.Layer): # IMPORTANT;TODO(Patrick, William) - this class will be deprecated soon. 
Do not use it anymore def __init__( - self, - channels: int, - num_head_channels: Optional[int]=None, - norm_num_groups: int=32, - rescale_output_factor: float=1.0, - eps: float=1e-5, ): + self, + channels: int, + num_head_channels: Optional[int] = None, + norm_num_groups: int = 32, + rescale_output_factor: float = 1.0, + eps: float = 1e-5, + ): super().__init__() self.channels = channels - self.num_heads = (channels // num_head_channels - if num_head_channels is not None else 1) + self.num_heads = channels // num_head_channels if num_head_channels is not None else 1 self.head_size = self.channels // self.num_heads self.scale = 1 / math.sqrt(self.channels / self.num_heads) - self.group_norm = nn.GroupNorm( - num_channels=channels, num_groups=norm_num_groups, epsilon=eps) + self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, epsilon=eps) # define q,k,v as linear layers self.query = nn.Linear(channels, channels) @@ -132,10 +131,7 @@ def __init__( self._use_2_5_attn = True self._attention_op = None - def reshape_heads_to_batch_dim(self, - tensor, - transpose=True, - merge_head_and_batch=False): + def reshape_heads_to_batch_dim(self, tensor, transpose=True, merge_head_and_batch=False): tensor = tensor.reshape([0, 0, self.num_heads, self.head_size]) # currently we donot use `unmerge_head_and_batch` if transpose or merge_head_and_batch: @@ -145,15 +141,11 @@ def reshape_heads_to_batch_dim(self, tensor = tensor.flatten(0, 1) return tensor - def reshape_batch_dim_to_heads(self, - tensor, - transpose=True, - unmerge_head_and_batch=False): + def reshape_batch_dim_to_heads(self, tensor, transpose=True, unmerge_head_and_batch=False): # currently we donot use `unmerge_head_and_batch` if unmerge_head_and_batch: seq_len = tensor.shape[1] - tensor = tensor.reshape( - [-1, self.num_heads, seq_len, self.head_size]) + tensor = tensor.reshape([-1, self.num_heads, seq_len, self.head_size]) if transpose or unmerge_head_and_batch: tensor = tensor.transpose([0, 2, 1, 3]) @@ -162,9 +154,10 @@ def reshape_batch_dim_to_heads(self, return tensor def set_use_memory_efficient_attention_xformers( - self, - use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 # if self.head_size > 128 and attention_op == "flash": # attention_op = "cutlass" @@ -176,18 +169,15 @@ def set_use_memory_efficient_attention_xformers( else: try: _ = F.scaled_dot_product_attention_( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e - self._use_memory_efficient_attention_xformers = ( - use_memory_efficient_attention_xformers) + self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers self._attention_op = attention_op def forward(self, hidden_states): @@ -197,8 +187,7 @@ def forward(self, hidden_states): # norm hidden_states = self.group_norm(hidden_states) - hidden_states = hidden_states.reshape( - [batch, channel, height * width]).transpose([0, 2, 1]) + hidden_states = hidden_states.reshape([batch, channel, height * 
width]).transpose([0, 2, 1]) # proj to q, k, v query_proj = self.query(hidden_states) @@ -206,14 +195,14 @@ def forward(self, hidden_states): value_proj = self.value(hidden_states) query_proj = self.reshape_heads_to_batch_dim( - query_proj, - transpose=not self._use_memory_efficient_attention_xformers) + query_proj, transpose=not self._use_memory_efficient_attention_xformers + ) key_proj = self.reshape_heads_to_batch_dim( - key_proj, - transpose=not self._use_memory_efficient_attention_xformers) + key_proj, transpose=not self._use_memory_efficient_attention_xformers + ) value_proj = self.reshape_heads_to_batch_dim( - value_proj, - transpose=not self._use_memory_efficient_attention_xformers) + value_proj, transpose=not self._use_memory_efficient_attention_xformers + ) if self._use_memory_efficient_attention_xformers: hidden_states = F.scaled_dot_product_attention_( @@ -224,25 +213,22 @@ def forward(self, hidden_states): scale=self.scale, dropout_p=0.0, training=self.training, - attention_op=self._attention_op, ) + attention_op=self._attention_op, + ) else: - attention_scores = (paddle.matmul( - query_proj, key_proj, transpose_y=True) * self.scale) - attention_probs = F.softmax( - attention_scores.cast("float32"), - axis=-1).cast(attention_scores.dtype) + attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale + attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype) hidden_states = paddle.matmul(attention_probs, value_proj) # reshape hidden_states hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, - transpose=not self._use_memory_efficient_attention_xformers) + hidden_states, transpose=not self._use_memory_efficient_attention_xformers + ) # compute next hidden_states hidden_states = self.proj_attn(hidden_states) - hidden_states = hidden_states.transpose([0, 2, 1]).reshape( - [batch, channel, height, width]) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch, channel, height, width]) # res connect and rescale hidden_states = (hidden_states + residual) / self.rescale_output_factor @@ -271,31 +257,29 @@ class BasicTransformerBlock(nn.Layer): """ def __init__( - self, - dim: int, - num_attention_heads: int, - attention_head_dim: int, - dropout=0.0, - cross_attention_dim: Optional[int]=None, - activation_fn: str="geglu", - num_embeds_ada_norm: Optional[int]=None, - attention_bias: bool=False, - only_cross_attention: bool=False, - double_self_attention: bool=False, - upcast_attention: bool=False, - norm_elementwise_affine: bool=True, - norm_type: str="layer_norm", - final_dropout: bool=False, ): + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + final_dropout: bool = False, + ): super().__init__() self.only_cross_attention = only_cross_attention - self.use_ada_layer_norm_zero = ( - num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" - self.use_ada_layer_norm = ( - num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == 
"ada_norm" - if norm_type in ("ada_norm", "ada_norm_zero" - ) and num_embeds_ada_norm is None: + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: raise ValueError( f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." @@ -320,22 +304,21 @@ def __init__( dim_head=attention_head_dim, dropout=dropout, bias=attention_bias, - cross_attention_dim=cross_attention_dim - if only_cross_attention else None, - upcast_attention=upcast_attention, ) + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) # 2. Cross-Attn if cross_attention_dim is not None or double_self_attention: # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during # the second cross attention block. - self.norm2 = (AdaLayerNorm(dim, num_embeds_ada_norm) - if self.use_ada_layer_norm else - nn.LayerNorm(dim, **norm_kwargs)) + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim, **norm_kwargs) + ) self.attn2 = Attention( query_dim=dim, - cross_attention_dim=cross_attention_dim - if not double_self_attention else None, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, heads=num_attention_heads, dim_head=attention_head_dim, dropout=dropout, @@ -352,46 +335,45 @@ def __init__( dim, dropout=dropout, activation_fn=activation_fn, - final_dropout=final_dropout, ) + final_dropout=final_dropout, + ) def forward( - self, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, ): + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ): # Notice that normalization is always applied before the real computation in the following blocks. # 1. 
Self-Attention if self.use_ada_layer_norm: norm_hidden_states = self.norm1(hidden_states, timestep) elif self.use_ada_layer_norm_zero: norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( - hidden_states, - timestep, - class_labels, - hidden_dtype=hidden_states.dtype) + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) else: norm_hidden_states = self.norm1(hidden_states) - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} attn_output = self.attn1( norm_hidden_states, - encoder_hidden_states=encoder_hidden_states - if self.only_cross_attention else None, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states if self.attn2 is not None: - norm_hidden_states = (self.norm2(hidden_states, timestep) - if self.use_ada_layer_norm else - self.norm2(hidden_states)) + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly # prepare attention mask here @@ -400,15 +382,15 @@ def forward( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=encoder_attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) hidden_states = attn_output + hidden_states # 3. Feed-forward norm_hidden_states = self.norm3(hidden_states) if self.use_ada_layer_norm_zero: - norm_hidden_states = (norm_hidden_states * - (1 + scale_mlp[:, None]) + shift_mlp[:, None]) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] ff_output = self.ff(norm_hidden_states) @@ -434,13 +416,14 @@ class FeedForward(nn.Layer): """ def __init__( - self, - dim: int, - dim_out: Optional[int]=None, - mult: int=4, - dropout: float=0.0, - activation_fn: str="geglu", - final_dropout: bool=False, ): + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + ): super().__init__() inner_dim = int(dim * mult) dim_out = dim_out if dim_out is not None else dim @@ -476,7 +459,7 @@ class GELU(nn.Layer): GELU activation function with tanh approximation support with `approximate="tanh"`. 
""" - def __init__(self, dim_in: int, dim_out: int, approximate: str="none"): + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"): super().__init__() self.proj = nn.Linear(dim_in, dim_out) self.approximate = approximate @@ -552,22 +535,17 @@ class AdaLayerNormZero(nn.Layer): def __init__(self, embedding_dim, num_embeddings): super().__init__() - self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, - embedding_dim) + self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim) self.silu = nn.Silu() - self.linear = nn.Linear( - embedding_dim, 6 * embedding_dim, bias_attr=True) + self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True) # elementwise_affine=False norm_kwargs = {"weight_attr": False, "bias_attr": False} self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs) def forward(self, x, timestep, class_labels, hidden_dtype=None): - emb = self.linear( - self.silu( - self.emb(timestep, class_labels, hidden_dtype=hidden_dtype))) - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk( - 6, axis=1) + emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype))) + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1) x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] return x, gate_msa, shift_mlp, scale_mlp, gate_mlp @@ -578,12 +556,13 @@ class AdaGroupNorm(nn.Layer): """ def __init__( - self, - embedding_dim: int, - out_dim: int, - num_groups: int, - act_fn: Optional[str]=None, - eps: float=1e-5, ): + self, + embedding_dim: int, + out_dim: int, + num_groups: int, + act_fn: Optional[str] = None, + eps: float = 1e-5, + ): super().__init__() self.num_groups = num_groups self.eps = eps @@ -600,8 +579,7 @@ def __init__( self.linear = nn.Linear(embedding_dim, out_dim * 2) # elementwise_affine=False norm_kwargs = {"weight_attr": False, "bias_attr": False} - self.group_norm = nn.GroupNorm( - num_groups, out_dim, epsilon=eps, **norm_kwargs) + self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs) self.group_norm.weight = None self.group_norm.bias = None diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py index 506c08b6c76b0..e2c4770f3398a 100644 --- a/ppdiffusers/ppdiffusers/models/attention_processor.py +++ b/ppdiffusers/ppdiffusers/models/attention_processor.py @@ -40,27 +40,27 @@ class Attention(nn.Layer): """ def __init__( - self, - query_dim: int, - cross_attention_dim: Optional[int]=None, - heads: int=8, - dim_head: int=64, - dropout: float=0.0, - bias=False, - upcast_attention: bool=False, - upcast_softmax: bool=False, - cross_attention_norm: Optional[str]=None, - cross_attention_norm_num_groups: int=32, - added_kv_proj_dim: Optional[int]=None, - norm_num_groups: Optional[int]=None, - out_bias: bool=True, - scale_qk: bool=True, - only_cross_attention: bool=False, - processor: Optional["AttnProcessor"]=None, ): + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias=False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + added_kv_proj_dim: Optional[int] = None, + norm_num_groups: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + processor: Optional["AttnProcessor"] = None, + ): 
super().__init__() inner_dim = dim_head * heads - cross_attention_dim = (cross_attention_dim if - cross_attention_dim is not None else query_dim) + cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim self.upcast_attention = upcast_attention self.upcast_softmax = upcast_softmax @@ -82,10 +82,7 @@ def __init__( ) if norm_num_groups is not None: - self.group_norm = nn.GroupNorm( - num_channels=query_dim, - num_groups=norm_num_groups, - epsilon=1e-5) + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, epsilon=1e-5) else: self.group_norm = None @@ -107,7 +104,8 @@ def __init__( self.norm_cross = nn.GroupNorm( num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, - epsilon=1e-5, ) + epsilon=1e-5, + ) else: raise ValueError( f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" @@ -117,10 +115,8 @@ def __init__( if not self.only_cross_attention: # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear( - cross_attention_dim, inner_dim, bias_attr=bias) - self.to_v = nn.Linear( - cross_attention_dim, inner_dim, bias_attr=bias) + self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias) + self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias) else: self.to_k = None self.to_v = None @@ -140,15 +136,17 @@ def __init__( self.set_processor(processor) def set_use_memory_efficient_attention_xformers( - self, - use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): is_lora = hasattr(self, "processor") and isinstance( - self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor)) + self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor) + ) is_custom_diffusion = hasattr(self, "processor") and isinstance( self.processor, - (CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor), ) + (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor), + ) is_added_kv = self.added_kv_proj_dim is not None if use_memory_efficient_attention_xformers: # if self.added_kv_proj_dim is not None: @@ -167,13 +165,11 @@ def set_use_memory_efficient_attention_xformers( try: # Make sure we can run the memory efficient attention _ = F.scaled_dot_product_attention_( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 @@ -184,7 +180,8 @@ def set_use_memory_efficient_attention_xformers( hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, rank=self.processor.rank, - attention_op=attention_op, ) + attention_op=attention_op, + ) # we must cast dtype processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) @@ -194,13 +191,13 @@ def set_use_memory_efficient_attention_xformers( train_q_out=self.processor.train_q_out, hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, - attention_op=attention_op, ) + attention_op=attention_op, + ) # we must cast dtype 
processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) elif is_added_kv: - processor = XFormersAttnAddedKVProcessor( - attention_op=attention_op) + processor = XFormersAttnAddedKVProcessor(attention_op=attention_op) else: processor = XFormersAttnProcessor(attention_op=attention_op) else: @@ -208,7 +205,8 @@ def set_use_memory_efficient_attention_xformers( processor = LoRAAttnProcessor( hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, - rank=self.processor.rank, ) + rank=self.processor.rank, + ) # we must cast dtype processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) @@ -217,7 +215,8 @@ def set_use_memory_efficient_attention_xformers( train_kv=self.processor.train_kv, train_q_out=self.processor.train_q_out, hidden_size=self.processor.hidden_size, - cross_attention_dim=self.processor.cross_attention_dim, ) + cross_attention_dim=self.processor.cross_attention_dim, + ) # we must cast dtype processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) @@ -230,9 +229,7 @@ def set_use_memory_efficient_attention_xformers( def set_attention_slice(self, slice_size): if slice_size is not None and slice_size > self.sliceable_head_dim: - raise ValueError( - f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}." - ) + raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.") if slice_size is not None and self.added_kv_proj_dim is not None: processor = SlicedAttnAddedKVProcessor(slice_size) @@ -248,22 +245,19 @@ def set_attention_slice(self, slice_size): def set_processor(self, processor: "AttnProcessor"): # if current processor is in `self._sub_layers` and if passed `processor` is not, we need to # pop `processor` from `self._sub_layers` - if (hasattr(self, "processor") and - isinstance(self.processor, nn.Layer) and - not isinstance(processor, nn.Layer)): - logger.info( - f"You are removing possibly trained weights of {self.processor} with {processor}" - ) + if hasattr(self, "processor") and isinstance(self.processor, nn.Layer) and not isinstance(processor, nn.Layer): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") self._sub_layers.pop("processor") self.processor = processor def forward( - self, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): # The `Attention` class can call different attention processors / attention functions # here we simply pass along all tensors to the selected processor class # For standard processors that are defined here, `**cross_attention_kwargs` is empty @@ -272,14 +266,14 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) def batch_to_head_dim(self, tensor, transpose=True, in_dim=4): if in_dim == 3: head_size = self.heads batch_size, seq_len, dim = tensor.shape - tensor = tensor.reshape( - [batch_size // head_size, head_size, seq_len, dim]) + tensor = tensor.reshape([batch_size // head_size, head_size, seq_len, dim]) if transpose: tensor = tensor.transpose([0, 2, 1, 3]) tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]]) @@ -301,8 +295,7 @@ def get_attention_scores(self, query, key, attention_mask=None): query = query.cast(paddle.float32) key = 
key.cast(paddle.float32) - attention_scores = paddle.matmul( - query, key, transpose_y=True) * self.scale + attention_scores = paddle.matmul(query, key, transpose_y=True) * self.scale if attention_mask is not None: attention_scores = attention_scores + attention_mask @@ -317,12 +310,7 @@ def get_attention_scores(self, query, key, attention_mask=None): return attention_probs - def prepare_attention_mask(self, - attention_mask, - target_length, - batch_size=None, - out_dim=4, - transpose=True): + def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=4, transpose=True): if batch_size is None: deprecate( "batch_size=None", @@ -331,7 +319,8 @@ def prepare_attention_mask(self, "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect" " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to" " `prepare_attention_mask` when preparing the attention_mask." - ), ) + ), + ) batch_size = 1 num_heads = self.heads @@ -339,21 +328,15 @@ def prepare_attention_mask(self, return attention_mask if attention_mask.shape[-1] != target_length: - attention_mask = F.pad(attention_mask, (0, target_length), - value=0.0, - data_format="NCL") + attention_mask = F.pad(attention_mask, (0, target_length), value=0.0, data_format="NCL") if out_dim == 3: if attention_mask.shape[0] < batch_size * num_heads: - attention_mask = attention_mask.repeat_interleave( - num_heads, axis=0) + attention_mask = attention_mask.repeat_interleave(num_heads, axis=0) elif out_dim == 4: attention_mask = attention_mask.unsqueeze(1) if attention_mask.shape[0] < batch_size * num_heads: - attention_mask = attention_mask.repeat_interleave( - num_heads, axis=1) - attention_mask = paddle.reshape( - attention_mask, - [batch_size, num_heads, -1, attention_mask.shape[-1]]) + attention_mask = attention_mask.repeat_interleave(num_heads, axis=1) + attention_mask = paddle.reshape(attention_mask, [batch_size, num_heads, -1, attention_mask.shape[-1]]) if attention_mask.ndim == 4: if not transpose: @@ -361,9 +344,7 @@ def prepare_attention_mask(self, return attention_mask def norm_encoder_hidden_states(self, encoder_hidden_states): - assert ( - self.norm_cross is not None - ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states" + assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states" if isinstance(self.norm_cross, nn.LayerNorm): encoder_hidden_states = self.norm_cross(encoder_hidden_states) @@ -384,24 +365,23 @@ def norm_encoder_hidden_states(self, encoder_hidden_states): class AttnProcessor: def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - 
encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -427,9 +407,7 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None): super().__init__() if rank > min(in_features, out_features): - raise ValueError( - f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}" - ) + raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}") self.down = nn.Linear(in_features, rank, bias_attr=False) self.up = nn.Linear(rank, out_features, bias_attr=False) @@ -469,39 +447,31 @@ class LoRAAttnProcessor(nn.Layer): Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. """ - def __init__(self, - hidden_size, - cross_attention_dim=None, - rank=4, - network_alpha=None): + def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): super().__init__() self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim self.rank = rank - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - scale=1.0, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + scale=1.0, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) query = attn.head_to_batch_dim(query) @@ -509,13 +479,10 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora( - encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora( - encoder_hidden_states) + key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) @@ -525,8 +492,7 @@ def __call__( hidden_states = attn.batch_to_head_dim(hidden_states) # linear proj - 
hidden_states = attn.to_out[0]( - hidden_states) + scale * self.to_out_lora(hidden_states) + hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) # dropout hidden_states = attn.to_out[1](hidden_states) @@ -535,13 +501,14 @@ def __call__( class CustomDiffusionAttnProcessor(nn.Layer): def __init__( - self, - train_kv=True, - train_q_out=True, - hidden_size=None, - cross_attention_dim=None, - out_bias=True, - dropout=0.0, ): + self, + train_kv=True, + train_q_out=True, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + ): super().__init__() self.train_kv = train_kv self.train_q_out = train_q_out @@ -551,35 +518,26 @@ def __init__( # `_custom_diffusion` id for easy serialization and loading. if self.train_kv: - self.to_k_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) - self.to_v_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) if self.train_q_out: - self.to_q_custom_diffusion = nn.Linear( - hidden_size, hidden_size, bias_attr=False) + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False) self.to_out_custom_diffusion = nn.LayerList([]) - self.to_out_custom_diffusion.append( - nn.Linear( - hidden_size, hidden_size, bias_attr=out_bias)) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias)) self.to_out_custom_diffusion.append(nn.Dropout(dropout)) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if self.train_q_out: query = self.to_q_custom_diffusion(hidden_states) else: @@ -591,8 +549,7 @@ def __call__( else: crossattn = True if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) if self.train_kv: key = self.to_k_custom_diffusion(encoder_hidden_states) @@ -631,40 +588,35 @@ def __call__( class AttnAddedKVProcessor: def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): residual = hidden_states - hidden_states = hidden_states.reshape( - [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1]) + hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( + [0, 2, 1] + ) batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, 
sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - hidden_states = attn.group_norm(hidden_states.transpose( - [0, 2, 1])).transpose([0, 2, 1]) + hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) query = attn.to_q(hidden_states) query = attn.head_to_batch_dim(query) encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj( - encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim( - encoder_hidden_states_key_proj) - encoder_hidden_states_value_proj = attn.head_to_batch_dim( - encoder_hidden_states_value_proj) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) if not attn.only_cross_attention: key = attn.to_k(hidden_states) @@ -672,8 +624,7 @@ def __call__( key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2) - value = paddle.concat( - [encoder_hidden_states_value_proj, value], axis=2) + value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2) else: key = encoder_hidden_states_key_proj value = encoder_hidden_states_value_proj @@ -687,53 +638,47 @@ def __call__( # dropout hidden_states = attn.to_out[1](hidden_states) - hidden_states = hidden_states.transpose( - [0, 2, 1]).reshape(residual.shape) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) hidden_states = hidden_states + residual return hidden_states class XFormersAttnAddedKVProcessor: - def __init__(self, attention_op: Optional[str]=None): + def __init__(self, attention_op: Optional[str] = None): assert attention_op in [None, "auto", "cutlass", "flash"] self.attention_op = attention_op def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): residual = hidden_states - hidden_states = hidden_states.reshape( - [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1]) + hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( + [0, 2, 1] + ) batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - hidden_states = attn.group_norm(hidden_states.transpose( - [0, 2, 1])).transpose([0, 2, 1]) + hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) query = attn.to_q(hidden_states) 
query = attn.head_to_batch_dim(query, transpose=False) encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj( - encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim( - encoder_hidden_states_key_proj, transpose=False) - encoder_hidden_states_value_proj = attn.head_to_batch_dim( - encoder_hidden_states_value_proj, transpose=False) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, transpose=False) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, transpose=False) if not attn.only_cross_attention: key = attn.to_k(hidden_states) @@ -741,8 +686,7 @@ def __call__( key = attn.head_to_batch_dim(key, transpose=False) value = attn.head_to_batch_dim(value, transpose=False) key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1) - value = paddle.concat( - [encoder_hidden_states_value_proj, value], axis=1) + value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1) else: key = encoder_hidden_states_key_proj value = encoder_hidden_states_value_proj @@ -755,7 +699,8 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) # linear proj @@ -763,39 +708,37 @@ def __call__( # dropout hidden_states = attn.to_out[1](hidden_states) - hidden_states = hidden_states.transpose( - [0, 2, 1]).reshape(residual.shape) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) hidden_states = hidden_states + residual return hidden_states class XFormersAttnProcessor: - def __init__(self, attention_op: Optional[str]=None): + def __init__(self, attention_op: Optional[str] = None): assert attention_op in [None, "auto", "cutlass", "flash"] self.attention_op = attention_op def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -813,7 +756,8 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) # hidden_states = hidden_states.cast(query.dtype) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) @@ -847,12 +791,13 @@ class LoRAXFormersAttnProcessor(nn.Layer): """ def __init__( - self, - hidden_size, - 
cross_attention_dim, - rank=4, - attention_op: Optional[str]=None, - network_alpha=None, ): + self, + hidden_size, + cross_attention_dim, + rank=4, + attention_op: Optional[str] = None, + network_alpha=None, + ): super().__init__() self.hidden_size = hidden_size @@ -860,28 +805,24 @@ def __init__( self.rank = rank self.attention_op = attention_op - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - scale=1.0, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + scale=1.0, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) query = attn.head_to_batch_dim(query, transpose=False) @@ -889,13 +830,10 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora( - encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora( - encoder_hidden_states) + key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) key = attn.head_to_batch_dim(key, transpose=False) value = attn.head_to_batch_dim(value, transpose=False) @@ -908,13 +846,13 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) # linear proj - hidden_states = attn.to_out[0]( - hidden_states) + scale * self.to_out_lora(hidden_states) + hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) # dropout hidden_states = attn.to_out[1](hidden_states) @@ -923,14 +861,15 @@ def __call__( class CustomDiffusionXFormersAttnProcessor(nn.Layer): def __init__( - self, - train_kv=True, - train_q_out=False, - hidden_size=None, - cross_attention_dim=None, - out_bias=True, - dropout=0.0, - attention_op: Optional[str]=None, ): + self, + 
train_kv=True, + train_q_out=False, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + attention_op: Optional[str] = None, + ): super().__init__() assert attention_op in [None, "auto", "cutlass", "flash"] self.train_kv = train_kv @@ -942,36 +881,27 @@ def __init__( # `_custom_diffusion` id for easy serialization and loading. if self.train_kv: - self.to_k_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) - self.to_v_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) if self.train_q_out: - self.to_q_custom_diffusion = nn.Linear( - hidden_size, hidden_size, bias_attr=False) + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False) self.to_out_custom_diffusion = nn.LayerList([]) - self.to_out_custom_diffusion.append( - nn.Linear( - hidden_size, hidden_size, bias_attr=out_bias)) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias)) self.to_out_custom_diffusion.append(nn.Dropout(dropout)) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) if self.train_q_out: query = self.to_q_custom_diffusion(hidden_states) @@ -984,8 +914,7 @@ def __call__( else: crossattn = True if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) if self.train_kv: key = self.to_k_custom_diffusion(encoder_hidden_states) @@ -1013,7 +942,8 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) # hidden_states = hidden_states.cast(query.dtype) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) @@ -1035,17 +965,17 @@ def __init__(self, slice_size): self.slice_size = slice_size def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, out_dim=3) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, 
out_dim=3) query = attn.to_q(hidden_states) query = attn.head_to_batch_dim(query) @@ -1053,8 +983,7 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -1067,27 +996,23 @@ def __call__( batch_size_attention = query.shape[0] query_len = query.shape[1] - hidden_states = paddle.zeros( - (batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) + hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) for i in range(batch_size_attention // self.slice_size): start_idx = i * self.slice_size end_idx = (i + 1) * self.slice_size query_slice = query[start_idx:end_idx] key_slice = key[start_idx:end_idx] - attn_mask_slice = (attention_mask[start_idx:end_idx] - if attention_mask is not None else None) + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - attn_slice = attn.get_attention_scores(query_slice, key_slice, - attn_mask_slice) + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx]) hidden_states[start_idx:end_idx] = attn_slice # reshape back to [bs, num_heads, seqlen, head_dim] - hidden_states = hidden_states.reshape( - [-1, attn.heads, query_len, attn.head_dim]) + hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim]) hidden_states = attn.batch_to_head_dim(hidden_states) # linear proj @@ -1103,42 +1028,37 @@ def __init__(self, slice_size): self.slice_size = slice_size def __call__( - self, - attn: "Attention", - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + attn: "Attention", + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): residual = hidden_states - hidden_states = hidden_states.reshape( - [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1]) + hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( + [0, 2, 1] + ) batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, out_dim=3) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=3) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - hidden_states = attn.group_norm(hidden_states.transpose( - [0, 2, 1])).transpose([0, 2, 1]) + hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) query = attn.to_q(hidden_states) query = attn.head_to_batch_dim(query) encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj( - encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim( - encoder_hidden_states_key_proj) - encoder_hidden_states_value_proj = attn.head_to_batch_dim( - encoder_hidden_states_value_proj) + 
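The slicing strategy used by SlicedAttnProcessor above can be restated in isolation; this is a toy sketch with assumed sizes, not code taken from the diff: only `slice_size` attention maps over the fused batch*heads dimension are materialized at a time.

    import paddle

    bh, q_len, kv_len, head_dim, slice_size = 16, 64, 64, 40, 4  # assumed sizes
    q = paddle.randn([bh, q_len, head_dim])
    k = paddle.randn([bh, kv_len, head_dim])
    v = paddle.randn([bh, kv_len, head_dim])
    out = paddle.zeros([bh, q_len, head_dim])
    for i in range(bh // slice_size):
        s, e = i * slice_size, (i + 1) * slice_size
        # softmax(q k^T / sqrt(d)) v, computed slice by slice to bound memory
        attn = paddle.nn.functional.softmax(
            paddle.matmul(q[s:e], k[s:e], transpose_y=True) * head_dim**-0.5, axis=-1
        )
        out[s:e] = paddle.matmul(attn, v[s:e])
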
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) if not attn.only_cross_attention: key = attn.to_k(hidden_states) @@ -1146,8 +1066,7 @@ def __call__( key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2) - value = paddle.concat( - [encoder_hidden_states_value_proj, value], axis=2) + value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2) else: key = encoder_hidden_states_key_proj value = encoder_hidden_states_value_proj @@ -1159,8 +1078,7 @@ def __call__( batch_size_attention = query.shape[0] query_len = query.shape[1] - hidden_states = paddle.zeros( - (batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) + hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) for i in range(batch_size_attention // self.slice_size): start_idx = i * self.slice_size @@ -1168,19 +1086,16 @@ def __call__( query_slice = query[start_idx:end_idx] key_slice = key[start_idx:end_idx] - attn_mask_slice = (attention_mask[start_idx:end_idx] - if attention_mask is not None else None) + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - attn_slice = attn.get_attention_scores(query_slice, key_slice, - attn_mask_slice) + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx]) hidden_states[start_idx:end_idx] = attn_slice # reshape back to [bs, num_heads, seqlen, head_dim] - hidden_states = hidden_states.reshape( - [-1, attn.heads, query_len, attn.head_dim]) + hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim]) hidden_states = attn.batch_to_head_dim(hidden_states) @@ -1189,8 +1104,7 @@ def __call__( # dropout hidden_states = attn.to_out[1](hidden_states) - hidden_states = hidden_states.transpose( - [0, 2, 1]).reshape(residual.shape) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) hidden_states = hidden_states + residual return hidden_states @@ -1200,9 +1114,17 @@ def __call__( AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor AttentionProcessor = Union[ - AttnProcessor, AttnProcessor2_5, XFormersAttnProcessor, SlicedAttnProcessor, - AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5, - XFormersAttnAddedKVProcessor, LoRAAttnProcessor, LoRAXFormersAttnProcessor, - LoRAAttnProcessor2_5, CustomDiffusionAttnProcessor, + AttnProcessor, + AttnProcessor2_5, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_5, + XFormersAttnAddedKVProcessor, + LoRAAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_5, + CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, ] diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py index 3d3b531d927e3..69d1b0fb98bb2 100644 --- a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py +++ b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py @@ -69,29 +69,30 @@ class AutoencoderKL(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=3, - out_channels: int=3, - down_block_types: Tuple[str]=("DownEncoderBlock2D", ), - down_block_out_channels: Tuple[int]=None, - 
up_block_types: Tuple[str]=("UpDecoderBlock2D", ), - up_block_out_channels: Tuple[int]=None, - block_out_channels: Tuple[int]=(64, ), - layers_per_block: int=1, - act_fn: str="silu", - latent_channels: int=4, - norm_num_groups: int=32, - sample_size: int=32, - scaling_factor: float=0.18215, ): + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + down_block_out_channels: Tuple[int] = None, + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + up_block_out_channels: Tuple[int] = None, + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + scaling_factor: float = 0.18215, + ): super().__init__() # if down_block_out_channels not givien, we will use block_out_channels - _down_block_out_channels = (self.config.block_out_channels - if down_block_out_channels is None else - self.config.down_block_out_channels) + _down_block_out_channels = ( + self.config.block_out_channels if down_block_out_channels is None else self.config.down_block_out_channels + ) # if up_block_out_channels not givien, we will use block_out_channels - _up_block_out_channels = (self.config.block_out_channels - if up_block_out_channels is None else - self.config.up_block_out_channels) + _up_block_out_channels = ( + self.config.block_out_channels if up_block_out_channels is None else self.config.up_block_out_channels + ) # pass init params to Encoder self.encoder = Encoder( @@ -102,7 +103,8 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - double_z=True, ) + double_z=True, + ) # pass init params to Decoder self.decoder = Decoder( @@ -112,7 +114,8 @@ def __init__( block_out_channels=_up_block_out_channels, layers_per_block=layers_per_block, norm_num_groups=norm_num_groups, - act_fn=act_fn, ) + act_fn=act_fn, + ) self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1) self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1) @@ -122,18 +125,19 @@ def __init__( # only relevant if vae tiling is enabled self.tile_sample_min_size = self.config.sample_size - sample_size = (self.config.sample_size[0] - if isinstance(self.config.sample_size, (list, tuple)) - else self.config.sample_size) - self.tile_latent_min_size = int(sample_size / - (2**(len(_up_block_out_channels) - 1))) + sample_size = ( + self.config.sample_size[0] + if isinstance(self.config.sample_size, (list, tuple)) + else self.config.sample_size + ) + self.tile_latent_min_size = int(sample_size / (2 ** (len(_up_block_out_channels) - 1))) self.tile_overlap_factor = 0.25 def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (Encoder, Decoder)): module.gradient_checkpointing = value - def enable_tiling(self, use_tiling: bool=True): + def enable_tiling(self, use_tiling: bool = True): r""" Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in several steps. 
This is useful to save a large amount of memory and to allow @@ -163,12 +167,10 @@ def disable_slicing(self): self.use_slicing = False @apply_forward_hook - def encode(self, x: paddle.Tensor, - return_dict: bool=True) -> AutoencoderKLOutput: + def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: # TODO junnyu, support float16 x = x.cast(self.encoder.conv_in.weight.dtype) - if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or - x.shape[-2] > self.tile_sample_min_size): + if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): return self.tiled_encode(x, return_dict=return_dict) h = self.encoder(x) @@ -176,57 +178,49 @@ def encode(self, x: paddle.Tensor, posterior = DiagonalGaussianDistribution(moments) if not return_dict: - return (posterior, ) + return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - def _decode(self, z: paddle.Tensor, - return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]: - if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or - z.shape[-2] > self.tile_latent_min_size): + def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]: + if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): return self.tiled_decode(z, return_dict=return_dict) z = self.post_quant_conv(z) dec = self.decoder(z) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) @apply_forward_hook - def decode(self, z: paddle.Tensor, - return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]: + def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]: # TODO junnyu, add this to support pure fp16 z = z.cast(self.post_quant_conv.weight.dtype) if self.use_slicing and z.shape[0] > 1: # split、chunk paddle vs pytorch may have some difference - decoded_slices = [ - self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0]) - ] + decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])] decoded = paddle.concat(decoded_slices) else: decoded = self._decode(z).sample if not return_dict: - return (decoded, ) + return (decoded,) return DecoderOutput(sample=decoded) def blend_v(self, a, b, blend_extent): for y in range(min(a.shape[2], b.shape[2], blend_extent)): - b[:, :, y, :] = a[:, :, -blend_extent + y, :] * ( - 1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) return b def blend_h(self, a, b, blend_extent): for x in range(min(a.shape[3], b.shape[3], blend_extent)): - b[:, :, :, x] = a[:, :, :, -blend_extent + x] * ( - 1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode(self, x: paddle.Tensor, - return_dict: bool=True) -> AutoencoderKLOutput: + def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: r"""Encode a batch of images using a tiled encoder. Args: When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -237,8 +231,7 @@ def tiled_encode(self, x: paddle.Tensor, x (`paddle.Tensor`): Input batch of images. 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AutoencoderKLOutput`] instead of a plain tuple. """ - overlap_size = int(self.tile_sample_min_size * - (1 - self.tile_overlap_factor)) + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) row_limit = self.tile_latent_min_size - blend_extent @@ -247,8 +240,12 @@ def tiled_encode(self, x: paddle.Tensor, for i in range(0, x.shape[2], overlap_size): row = [] for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i:i + self.tile_sample_min_size, j:j + - self.tile_sample_min_size, ] + tile = x[ + :, + :, + i : i + self.tile_sample_min_size, + j : j + self.tile_sample_min_size, + ] tile = self.encoder(tile) tile = self.quant_conv(tile) row.append(tile) @@ -270,13 +267,11 @@ def tiled_encode(self, x: paddle.Tensor, posterior = DiagonalGaussianDistribution(moments) if not return_dict: - return (posterior, ) + return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - def tiled_decode( - self, z: paddle.Tensor, - return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]: + def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]: r"""Decode a batch of images using a tiled decoder. Args: When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several @@ -288,8 +283,7 @@ def tiled_decode( `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. """ - overlap_size = int(self.tile_latent_min_size * - (1 - self.tile_overlap_factor)) + overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) row_limit = self.tile_sample_min_size - blend_extent @@ -299,8 +293,12 @@ def tiled_decode( for i in range(0, z.shape[2], overlap_size): row = [] for j in range(0, z.shape[3], overlap_size): - tile = z[:, :, i:i + self.tile_latent_min_size, j:j + - self.tile_latent_min_size, ] + tile = z[ + :, + :, + i : i + self.tile_latent_min_size, + j : j + self.tile_latent_min_size, + ] tile = self.post_quant_conv(tile) decoded = self.decoder(tile) row.append(decoded) @@ -320,17 +318,17 @@ def tiled_decode( dec = paddle.concat(result_rows, axis=2) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) def forward( - self, - sample: paddle.Tensor, - sample_posterior: bool=False, - return_dict: bool=True, - generator: Optional[paddle.Generator]=None, ) -> Union[ - DecoderOutput, paddle.Tensor]: + self, + sample: paddle.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[paddle.Generator] = None, + ) -> Union[DecoderOutput, paddle.Tensor]: r""" Args: sample (`paddle.Tensor`): Input sample. 
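As a usage note for the tiling and slicing switches above, a small sketch (the checkpoint name and image size are assumed, not taken from this diff): once enabled, encode()/decode() route oversized inputs through tiled_encode()/tiled_decode() and blend the tile seams.

    import paddle
    from ppdiffusers import AutoencoderKL

    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")  # assumed source
    vae.enable_tiling()   # split large images into overlapping tiles and blend the seams
    vae.enable_slicing()  # decode the batch one sample at a time
    with paddle.no_grad():
        image = paddle.randn([1, 3, 1024, 1024])
        latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
        decoded = vae.decode(latents / vae.config.scaling_factor).sample
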
@@ -348,6 +346,6 @@ def forward( dec = self.decode(z).sample if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) diff --git a/ppdiffusers/ppdiffusers/models/controlnet.py b/ppdiffusers/ppdiffusers/models/controlnet.py index 6662f2904992c..2ac640f58f21e 100644 --- a/ppdiffusers/ppdiffusers/models/controlnet.py +++ b/ppdiffusers/ppdiffusers/models/controlnet.py @@ -25,8 +25,12 @@ from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_2d_blocks import (CrossAttnDownBlock2D, DownBlock2D, - UNetMidBlock2DCrossAttn, get_down_block) +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + get_down_block, +) from .unet_2d_condition import UNet2DConditionModel logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -54,37 +58,31 @@ class ControlNetConditioningEmbedding(nn.Layer): """ def __init__( - self, - conditioning_embedding_channels: int, - conditioning_channels: int=3, - block_out_channels: Tuple[int]=(16, 32, 96, 256), ): + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int] = (16, 32, 96, 256), + ): super().__init__() - self.conv_in = nn.Conv2D( - conditioning_channels, - block_out_channels[0], - kernel_size=3, - padding=1) + self.conv_in = nn.Conv2D(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) self.blocks = nn.LayerList([]) for i in range(len(block_out_channels) - 1): channel_in = block_out_channels[i] channel_out = block_out_channels[i + 1] - self.blocks.append( - nn.Conv2D( - channel_in, channel_in, kernel_size=3, padding=1)) - self.blocks.append( - nn.Conv2D( - channel_in, channel_out, kernel_size=3, padding=1, - stride=2)) + self.blocks.append(nn.Conv2D(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2D(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) self.conv_out = zero_module( nn.Conv2D( block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, - padding=1, )) + padding=1, + ) + ) def forward(self, conditioning): embedding = self.conv_in(conditioning) @@ -104,36 +102,37 @@ class ControlNetModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=4, - flip_sin_to_cos: bool=True, - freq_shift: int=0, - down_block_types: Tuple[str]=( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", ), - only_cross_attention: Union[bool, Tuple[bool]]=False, - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: int=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-5, - cross_attention_dim: int=1280, - attention_head_dim: Union[int, Tuple[int]]=8, - use_linear_projection: bool=False, - class_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - upcast_attention: bool=False, - resnet_time_scale_shift: str="default", - projection_class_embeddings_input_dim: Optional[int]=None, - controlnet_conditioning_channel_order: str="rgb", - conditioning_embedding_out_channels: Optional[Tuple[int]]=(16, 32, - 96, 256), - global_pool_conditions: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int = 4, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + 
"CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: Union[int, Tuple[int]] = 8, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() # Check inputs @@ -142,16 +141,12 @@ def __init__( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - only_cross_attention, - bool) and len(only_cross_attention) != len(down_block_types): + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." ) @@ -163,27 +158,26 @@ def __init__( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time time_embed_dim = block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding( timestep_input_dim, time_embed_dim, - act_fn=act_fn, ) + act_fn=act_fn, + ) # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -198,25 +192,24 @@ def __init__( # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
- self.class_embedding = TimestepEmbedding( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None # control net conditioning embedding self.controlnet_cond_embedding = ControlNetConditioningEmbedding( conditioning_embedding_channels=block_out_channels[0], - block_out_channels=conditioning_embedding_out_channels, ) + block_out_channels=conditioning_embedding_out_channels, + ) self.down_blocks = nn.LayerList([]) self.controlnet_down_blocks = nn.LayerList([]) if isinstance(only_cross_attention, bool): - only_cross_attention = [only_cross_attention] * len( - down_block_types) + only_cross_attention = [only_cross_attention] * len(down_block_types) if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) # pre_temb_act_fun opt self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity @@ -233,8 +226,7 @@ def __init__( # down output_channel = block_out_channels[0] - controlnet_block = nn.Conv2D( - output_channel, output_channel, kernel_size=1) + controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_down_blocks.append(controlnet_block) @@ -260,27 +252,24 @@ def __init__( only_cross_attention=only_cross_attention[i], upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=self. - resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) for _ in range(layers_per_block): - controlnet_block = nn.Conv2D( - output_channel, output_channel, kernel_size=1) + controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_down_blocks.append(controlnet_block) if not is_final_block: - controlnet_block = nn.Conv2D( - output_channel, output_channel, kernel_size=1) + controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_down_blocks.append(controlnet_block) # mid mid_block_channel = block_out_channels[-1] - controlnet_block = nn.Conv2D( - mid_block_channel, mid_block_channel, kernel_size=1) + controlnet_block = nn.Conv2D(mid_block_channel, mid_block_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_mid_block = controlnet_block @@ -296,16 +285,17 @@ def __init__( resnet_groups=norm_num_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, + ) @classmethod def from_unet( - cls, - unet: UNet2DConditionModel, - controlnet_conditioning_channel_order: str="rgb", - conditioning_embedding_out_channels: Optional[Tuple[int]]=(16, 32, - 96, 256), - load_weights_from_unet: bool=True, ): + cls, + unet: UNet2DConditionModel, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), + load_weights_from_unet: bool = True, + ): r""" Instantiate Controlnet class from UNet2DConditionModel. 
Parameters: @@ -333,22 +323,19 @@ def from_unet( num_class_embeds=unet.config.num_class_embeds, upcast_attention=unet.config.upcast_attention, resnet_time_scale_shift=unet.config.resnet_time_scale_shift, - projection_class_embeddings_input_dim=unet.config. - projection_class_embeddings_input_dim, + projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, conditioning_embedding_out_channels=conditioning_embedding_out_channels, - resnet_pre_temb_non_linearity=unet.config. - resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=unet.config.resnet_pre_temb_non_linearity, + ) if load_weights_from_unet: controlnet.conv_in.load_dict(unet.conv_in.state_dict()) controlnet.time_proj.load_dict(unet.time_proj.state_dict()) - controlnet.time_embedding.load_dict(unet.time_embedding.state_dict( - )) + controlnet.time_embedding.load_dict(unet.time_embedding.state_dict()) if controlnet.class_embedding: - controlnet.class_embedding.load_dict( - unet.class_embedding.state_dict()) + controlnet.class_embedding.load_dict(unet.class_embedding.state_dict()) controlnet.down_blocks.load_dict(unet.down_blocks.state_dict()) controlnet.mid_block.load_dict(unet.mid_block.state_dict()) @@ -365,16 +352,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -383,9 +366,7 @@ def fn_recursive_add_processors( return processors - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -409,8 +390,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -457,8 +437,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -470,14 +449,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. 
# Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -493,18 +470,19 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: paddle.Tensor, - conditioning_scale: Union[List[float], float]=1.0, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - guess_mode: bool=False, - return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + controlnet_cond: paddle.Tensor, + conditioning_scale: Union[List[float], float] = 1.0, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: # TODO junnyu, add this to support pure fp16 sample = sample.cast(self.dtype) @@ -517,9 +495,7 @@ def forward( elif channel_order == "bgr": controlnet_cond = paddle.flip(controlnet_cond, axis=[1]) else: - raise ValueError( - f"unknown `controlnet_conditioning_channel_order`: {channel_order}" - ) + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") # prepare attention_mask if attention_mask is not None: @@ -534,7 +510,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) @@ -547,8 +527,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when num_class_embeds > 0") + raise ValueError("class_labels should be provided when num_class_embeds > 0") # maybe cast it to float16 class_labels = class_labels.cast(self.dtype) @@ -572,20 +551,19 @@ def forward( sample += controlnet_cond # 3. 
down - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples @@ -596,16 +574,16 @@ def forward( emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) # 5. Control net blocks controlnet_down_block_res_samples = () - for down_block_res_sample, controlnet_block in zip( - down_block_res_samples, self.controlnet_down_blocks): + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): down_block_res_sample = controlnet_block(down_block_res_sample) - controlnet_down_block_res_samples += (down_block_res_sample, ) + controlnet_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = controlnet_down_block_res_samples @@ -613,45 +591,34 @@ def forward( # 6. scaling if guess_mode: - scales = paddle.logspace( - -1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 + scales = paddle.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 scales *= conditioning_scale - down_block_res_samples = [ - sample * scale - for sample, scale in zip(down_block_res_samples, scales) - ] + down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] mid_block_res_sample *= scales[-1] # last one else: # add conditioning_scale https://github.com/huggingface/diffusers/pull/2627 if isinstance(conditioning_scale, (float, int)): - down_block_res_samples = [ - sample * conditioning_scale - for sample in down_block_res_samples - ] + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] mid_block_res_sample *= conditioning_scale else: down_block_res_samples = [ - sample * ccs - for sample, ccs in zip(down_block_res_samples, - conditioning_scale[:-1]) + sample * ccs for sample, ccs in zip(down_block_res_samples, conditioning_scale[:-1]) ] mid_block_res_sample *= conditioning_scale[-1] if self.config.global_pool_conditions: down_block_res_samples = [ - paddle.mean( - sample, axis=(2, 3), keepdim=True) - for sample in down_block_res_samples + paddle.mean(sample, axis=(2, 3), keepdim=True) for sample in down_block_res_samples ] - mid_block_res_sample = paddle.mean( - mid_block_res_sample, axis=(2, 3), keepdim=True) + mid_block_res_sample = paddle.mean(mid_block_res_sample, axis=(2, 3), keepdim=True) if not return_dict: return (down_block_res_samples, mid_block_res_sample) return ControlNetOutput( down_block_res_samples=down_block_res_samples, - mid_block_res_sample=mid_block_res_sample, ) + mid_block_res_sample=mid_block_res_sample, + ) def zero_module(module): diff --git a/ppdiffusers/ppdiffusers/models/cross_attention.py b/ppdiffusers/ppdiffusers/models/cross_attention.py index 06660a99f385d..10911591e9f36 100644 --- 
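The scaling logic above is easiest to read from the caller's side; the following sketch assumes `controlnet`, `unet`, `noisy_latents`, `t`, `text_emb`, and `cond_image` are already built, and the UNet residual keyword names follow the usual diffusers-style API rather than this diff.

    # Assumed, pre-built objects: controlnet, unet, noisy_latents, t, text_emb, cond_image.
    down_res, mid_res = controlnet(
        sample=noisy_latents,
        timestep=t,
        encoder_hidden_states=text_emb,
        controlnet_cond=cond_image,
        conditioning_scale=0.8,  # a single float scales every block equally
        guess_mode=False,        # True ramps the scales from 0.1 to 1.0 across blocks
        return_dict=False,
    )
    noise_pred = unet(
        noisy_latents, t, encoder_hidden_states=text_emb,
        down_block_additional_residuals=down_res,      # assumed keyword names
        mid_block_additional_residual=mid_res,
    ).sample
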
a/ppdiffusers/ppdiffusers/models/cross_attention.py +++ b/ppdiffusers/ppdiffusers/models/cross_attention.py @@ -15,17 +15,21 @@ from .attention_processor import AttentionProcessor # noqa: F401 from .attention_processor import AttnProcessor2_5 # noqa: F401 from .attention_processor import Attention, AttnAddedKVProcessor -from .attention_processor import \ - AttnProcessor as AttnProcessorRename # noqa: F401 +from .attention_processor import AttnProcessor as AttnProcessorRename # noqa: F401 from .attention_processor import ( - LoRAAttnProcessor, LoRALinearLayer, LoRAXFormersAttnProcessor, - SlicedAttnAddedKVProcessor, SlicedAttnProcessor, XFormersAttnProcessor) + LoRAAttnProcessor, + LoRAXFormersAttnProcessor, + SlicedAttnAddedKVProcessor, + SlicedAttnProcessor, + XFormersAttnProcessor, +) deprecate( "cross_attention", "0.18.0", "Importing from cross_attention is deprecated. Please import from diffusers.models.attention_processor instead.", - standard_warn=False, ) + standard_warn=False, +) AttnProcessor = AttentionProcessor @@ -33,86 +37,54 @@ class CrossAttention(Attention): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class CrossAttnProcessor(AttnProcessorRename): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class LoRACrossAttnProcessor(LoRAAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class CrossAttnAddedKVProcessor(AttnAddedKVProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class XFormersCrossAttnProcessor(XFormersAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." 
- deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class LoRAXFormersCrossAttnProcessor(LoRAXFormersAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class SlicedCrossAttnProcessor(SlicedAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class SlicedCrossAttnAddedKVProcessor(SlicedAttnAddedKVProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) diff --git a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py index d1f6482176d0d..d6f680e81fc62 100644 --- a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py +++ b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py @@ -47,35 +47,40 @@ class DualTransformer2DModel(nn.Layer): """ def __init__( - self, - num_attention_heads: int=16, - attention_head_dim: int=88, - in_channels: Optional[int]=None, - num_layers: int=1, - dropout: float=0.0, - norm_num_groups: int=32, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - sample_size: Optional[int]=None, - num_vector_embeds: Optional[int]=None, - activation_fn: str="geglu", - num_embeds_ada_norm: Optional[int]=None, ): + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + ): super().__init__() - self.transformers = nn.LayerList([ - Transformer2DModel( - num_attention_heads=num_attention_heads, - attention_head_dim=attention_head_dim, - in_channels=in_channels, - num_layers=num_layers, - dropout=dropout, - norm_num_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attention_bias=attention_bias, - sample_size=sample_size, - num_vector_embeds=num_vector_embeds, - activation_fn=activation_fn, - num_embeds_ada_norm=num_embeds_ada_norm, ) for _ in range(2) - ]) + self.transformers = nn.LayerList( + [ + Transformer2DModel( + 
num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + in_channels=in_channels, + num_layers=num_layers, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + sample_size=sample_size, + num_vector_embeds=num_vector_embeds, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + ) + for _ in range(2) + ] + ) # Variables that can be set by a pipeline: @@ -91,13 +96,14 @@ def __init__( self.transformer_index_for_condition = [1, 0] def forward( - self, - hidden_states, - encoder_hidden_states, - timestep=None, - attention_mask=None, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + encoder_hidden_states, + timestep=None, + attention_mask=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): """ Args: hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. @@ -125,23 +131,22 @@ def forward( # attention_mask is not used yet for i in range(2): # for each of the two transformers, pass the corresponding condition tokens - condition_state = encoder_hidden_states[:, tokens_start:tokens_start - + self.condition_lengths[i]] + condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]] transformer_index = self.transformer_index_for_condition[i] encoded_state = self.transformers[transformer_index]( input_states, encoder_hidden_states=condition_state, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] encoded_states.append(encoded_state - input_states) tokens_start += self.condition_lengths[i] - output_states = encoded_states[0] * self.mix_ratio + encoded_states[ - 1] * (1 - self.mix_ratio) + output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio) output_states = output_states + input_states if not return_dict: - return (output_states, ) + return (output_states,) return Transformer2DModelOutput(sample=output_states) diff --git a/ppdiffusers/ppdiffusers/models/ema.py b/ppdiffusers/ppdiffusers/models/ema.py index 1d88a8a18c498..b42e0c2ad02ad 100644 --- a/ppdiffusers/ppdiffusers/models/ema.py +++ b/ppdiffusers/ppdiffusers/models/ema.py @@ -34,14 +34,11 @@ def __init__(self, model, decay=0.9999, use_num_upates=True): raise ValueError("Decay must be between 0 and 1") self.m_name2s_name = {} - self.register_buffer( - "decay", paddle.to_tensor( - decay, dtype=paddle.float32)) + self.register_buffer("decay", paddle.to_tensor(decay, dtype=paddle.float32)) self.register_buffer( "num_updates", - paddle.to_tensor( - 0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor( - -1, dtype=paddle.int64), ) + paddle.to_tensor(0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor(-1, dtype=paddle.int64), + ) for name, p in model.named_parameters(): if not p.stop_gradient: @@ -57,8 +54,7 @@ def forward(self, model): if self.num_updates >= 0: self.num_updates += 1 - decay = min(self.decay, - (1 + self.num_updates) / (10 + self.num_updates)) + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) one_minus_decay = 1.0 - decay @@ -79,8 +75,7 @@ def copy_to(self, model): shadow_params = dict(self.named_buffers()) for key in m_param: if not m_param[key].stop_gradient: - m_param[key].copy_(shadow_params[self.m_name2s_name[key]], - False) + m_param[key].copy_(shadow_params[self.m_name2s_name[key]], False) else: assert key not in 
self.m_name2s_name @@ -91,9 +86,7 @@ def store(self, parameters): parameters: Iterable of `EagerParamBase`; the parameters to be temporarily stored. """ - self.collected_params = [ - param.detach().cpu().clone() for param in parameters - ] + self.collected_params = [param.detach().cpu().clone() for param in parameters] def restore(self, parameters): """ diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py index 9527cf3ae055b..4c38ff3d44a98 100644 --- a/ppdiffusers/ppdiffusers/models/embeddings.py +++ b/ppdiffusers/ppdiffusers/models/embeddings.py @@ -21,12 +21,13 @@ def get_timestep_embedding( - timesteps: paddle.Tensor, - embedding_dim: int, - flip_sin_to_cos: bool=False, - downscale_freq_shift: float=1, - scale: float=1, - max_period: int=10000, ): + timesteps: paddle.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): """ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. @@ -38,8 +39,7 @@ def get_timestep_embedding( assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" half_dim = embedding_dim // 2 - exponent = -math.log(max_period) * paddle.arange( - start=0, end=half_dim, dtype="float32") + exponent = -math.log(max_period) * paddle.arange(start=0, end=half_dim, dtype="float32") exponent = exponent / (half_dim - downscale_freq_shift) @@ -62,10 +62,7 @@ def get_timestep_embedding( return emb -def get_2d_sincos_pos_embed(embed_dim, - grid_size, - cls_token=False, - extra_tokens=0): +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0): """ grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) @@ -78,8 +75,7 @@ def get_2d_sincos_pos_embed(embed_dim, grid = grid.reshape([2, 1, grid_size, grid_size]) pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) if cls_token and extra_tokens > 0: - pos_embed = np.concatenate( - [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) return pos_embed @@ -88,10 +84,8 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): raise ValueError("embed_dim must be divisible by 2") # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, - grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, - grid[1]) # (H*W, D/2) + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) return emb @@ -122,16 +116,17 @@ class PatchEmbed(nn.Layer): """2D Image to Patch Embedding""" def __init__( - self, - height=224, - width=224, - patch_size=16, - in_channels=3, - embed_dim=768, - layer_norm=False, - flatten=True, - bias=True, - add_pos_embed=True, ): + self, + height=224, + width=224, + patch_size=16, + in_channels=3, + embed_dim=768, + layer_norm=False, + flatten=True, + bias=True, + add_pos_embed=True, + ): super().__init__() num_patches = (height // patch_size) * (width // patch_size) @@ -143,22 +138,22 @@ def __init__( embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, - bias_attr=bias, ) + bias_attr=bias, + ) if layer_norm: # elementwise_affine=False -> 
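The sinusoidal construction in get_timestep_embedding above reduces, with the default flip_sin_to_cos=False, downscale_freq_shift=1 and scale=1, to the following compact restatement (toy sizes, for illustration only).

    import math

    import paddle

    t = paddle.to_tensor([1.0, 2.0, 50.0])          # timesteps
    half_dim, max_period = 4, 10000                 # embedding_dim == 8
    freqs = paddle.exp(paddle.arange(0, half_dim, dtype="float32")
                       * (-math.log(max_period) / (half_dim - 1)))
    emb = paddle.concat([paddle.sin(t[:, None] * freqs[None, :]),
                         paddle.cos(t[:, None] * freqs[None, :])], axis=-1)  # shape [3, 8]
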
weight_attr=False, bias_attr=False - self.norm = nn.LayerNorm( - embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False) + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False) else: self.norm = None self.add_pos_embed = add_pos_embed if add_pos_embed: - pos_embed = get_2d_sincos_pos_embed(embed_dim, - int(num_patches**0.5)) + pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5)) self.register_buffer( "pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), - persistable=False, ) + persistable=False, + ) def forward(self, latent): latent = self.proj(latent) @@ -174,20 +169,20 @@ def forward(self, latent): class TimestepEmbedding(nn.Layer): def __init__( - self, - in_channels: int, - time_embed_dim: int, - act_fn: str="silu", - out_dim: int=None, - post_act_fn: Optional[str]=None, - cond_proj_dim=None, ): + self, + in_channels: int, + time_embed_dim: int, + act_fn: str = "silu", + out_dim: int = None, + post_act_fn: Optional[str] = None, + cond_proj_dim=None, + ): super().__init__() self.linear_1 = nn.Linear(in_channels, time_embed_dim) if cond_proj_dim is not None: - self.cond_proj = nn.Linear( - cond_proj_dim, in_channels, bias_attr=False) + self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias_attr=False) else: self.cond_proj = None @@ -198,9 +193,7 @@ def __init__( elif act_fn == "gelu": self.act = nn.GELU() else: - raise ValueError( - f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'" - ) + raise ValueError(f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'") if out_dim is not None: time_embed_dim_out = out_dim @@ -217,9 +210,7 @@ def __init__( elif post_act_fn == "gelu": self.post_act = nn.GELU() else: - raise ValueError( - f"{post_act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'" - ) + raise ValueError(f"{post_act_fn} does not exist. 
Make sure to define one of 'silu', 'mish', or 'gelu'") def forward(self, sample, condition=None): if condition is not None: @@ -237,10 +228,7 @@ def forward(self, sample, condition=None): class Timesteps(nn.Layer): - def __init__(self, - num_channels: int, - flip_sin_to_cos: bool, - downscale_freq_shift: float): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): super().__init__() self.num_channels = num_channels self.flip_sin_to_cos = flip_sin_to_cos @@ -251,7 +239,8 @@ def forward(self, timesteps): timesteps, self.num_channels, flip_sin_to_cos=self.flip_sin_to_cos, - downscale_freq_shift=self.downscale_freq_shift, ) + downscale_freq_shift=self.downscale_freq_shift, + ) return t_emb @@ -259,20 +248,21 @@ class GaussianFourierProjection(nn.Layer): """Gaussian Fourier embeddings for noise levels.""" def __init__( - self, - embedding_size: int=256, - scale: float=1.0, - set_W_to_weight=True, - log=True, - flip_sin_to_cos=False, ): + self, + embedding_size: int = 256, + scale: float = 1.0, + set_W_to_weight=True, + log=True, + flip_sin_to_cos=False, + ): super().__init__() - self.register_buffer("weight", paddle.randn((embedding_size, )) * scale) + self.register_buffer("weight", paddle.randn((embedding_size,)) * scale) self.log = log self.flip_sin_to_cos = flip_sin_to_cos if set_W_to_weight: # to delete later - self.register_buffer("W", paddle.randn((embedding_size, )) * scale) + self.register_buffer("W", paddle.randn((embedding_size,)) * scale) self.weight = self.W @@ -285,11 +275,9 @@ def forward(self, x): x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi if self.flip_sin_to_cos: - out = paddle.concat( - [paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1) + out = paddle.concat([paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1) else: - out = paddle.concat( - [paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1) + out = paddle.concat([paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1) return out @@ -318,11 +306,12 @@ class ImagePositionalEmbeddings(nn.Layer): """ def __init__( - self, - num_embed: int, - height: int, - width: int, - embed_dim: int, ): + self, + num_embed: int, + height: int, + width: int, + embed_dim: int, + ): super().__init__() self.height = height @@ -337,14 +326,12 @@ def __init__( def forward(self, index): emb = self.emb(index) - height_emb = self.height_emb( - paddle.arange(self.height).reshape([1, self.height])) + height_emb = self.height_emb(paddle.arange(self.height).reshape([1, self.height])) # 1 x H x D -> 1 x H x 1 x D height_emb = height_emb.unsqueeze(2) - width_emb = self.width_emb( - paddle.arange(self.width).reshape([1, self.width])) + width_emb = self.width_emb(paddle.arange(self.width).reshape([1, self.width])) # 1 x W x D -> 1 x 1 x W x D width_emb = width_emb.unsqueeze(1) @@ -354,7 +341,7 @@ def forward(self, index): # 1 x H x W x D -> 1 x L xD pos_emb = pos_emb.reshape([1, self.height * self.width, -1]) - emb = emb + pos_emb[:, :emb.shape[1], :] + emb = emb + pos_emb[:, : emb.shape[1], :] return emb @@ -372,8 +359,7 @@ class LabelEmbedding(nn.Layer): def __init__(self, num_classes, hidden_size, dropout_prob): super().__init__() use_cfg_embedding = dropout_prob > 0 - self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, - hidden_size) + self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) self.num_classes = num_classes self.dropout_prob = dropout_prob @@ -382,7 +368,12 @@ def token_drop(self, labels, force_drop_ids=None): Drops labels to enable classifier-free guidance. 
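This label drop is the training half of classifier-free guidance: with probability dropout_prob a label is swapped for the extra "null" class id, as the code below does. A toy restatement (illustrative numbers, with paddle.full_like substituted for the scalar form):

    import paddle

    num_classes, dropout_prob = 10, 0.1
    labels = paddle.to_tensor([3, 7, 1, 9])
    drop = paddle.rand([labels.shape[0]]) < dropout_prob
    # dropped labels point at the extra "null" class id == num_classes
    labels = paddle.where(drop, paddle.full_like(labels, num_classes), labels)
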
""" if force_drop_ids is None: - drop_ids = (paddle.rand((labels.shape[0], ), ) < self.dropout_prob) + drop_ids = ( + paddle.rand( + (labels.shape[0],), + ) + < self.dropout_prob + ) else: drop_ids = paddle.to_tensor(force_drop_ids == 1) labels = paddle.where(drop_ids, self.num_classes, labels) @@ -400,17 +391,13 @@ class CombinedTimestepLabelEmbeddings(nn.Layer): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): super().__init__() - self.time_proj = Timesteps( - num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) - self.timestep_embedder = TimestepEmbedding( - in_channels=256, time_embed_dim=embedding_dim) - self.class_embedder = LabelEmbedding(num_classes, embedding_dim, - class_dropout_prob) + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) + self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob) def forward(self, timestep, class_labels, hidden_dtype=None): timesteps_proj = self.time_proj(timestep) - timesteps_emb = self.timestep_embedder( - timesteps_proj.cast(hidden_dtype)) # (N, D) + timesteps_emb = self.timestep_embedder(timesteps_proj.cast(hidden_dtype)) # (N, D) class_labels = self.class_embedder(class_labels) # (N, D) @@ -420,8 +407,7 @@ def forward(self, timestep, class_labels, hidden_dtype=None): class TextTimeEmbedding(nn.Layer): - def __init__(self, encoder_dim: int, time_embed_dim: int, - num_heads: int=64): + def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64): super().__init__() self.norm1 = nn.LayerNorm(encoder_dim) self.pool = AttentionPooling(num_heads, encoder_dim) @@ -443,8 +429,8 @@ def __init__(self, num_heads, embed_dim, dtype=None): super().__init__() self.positional_embedding = self.create_parameter( (1, embed_dim), - default_initializer=nn.initializer.Assign( - paddle.randn((1, embed_dim)) / embed_dim**0.5), ) + default_initializer=nn.initializer.Assign(paddle.randn((1, embed_dim)) / embed_dim**0.5), + ) self.k_proj = nn.Linear(embed_dim, embed_dim) self.q_proj = nn.Linear(embed_dim, embed_dim) self.v_proj = nn.Linear(embed_dim, embed_dim) @@ -466,8 +452,7 @@ def shape(x): x = x.transpose([0, 2, 1]) return x - class_token = x.mean( - axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype) + class_token = x.mean(axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype) x = paddle.concat([class_token, x], axis=1) # (bs, length+1, width) # (bs*n_heads, class_token_length, dim_per_head) @@ -478,10 +463,9 @@ def shape(x): # (bs*n_heads, class_token_length, length+class_token_length): weight = paddle.einsum( - "bct,bcs->bts", q * self.scale, - k * self.scale) # More stable with f16 than dividing afterwards - weight = nn.functional.softmax( - weight.cast("float32"), axis=-1).cast(weight.dtype) + "bct,bcs->bts", q * self.scale, k * self.scale + ) # More stable with f16 than dividing afterwards + weight = nn.functional.softmax(weight.cast("float32"), axis=-1).cast(weight.dtype) # (bs*n_heads, dim_per_head, class_token_length) a = paddle.einsum("bts,bcs->bct", weight, v) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py b/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py index 192173d39afdf..d3a3befd29063 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py @@ -39,8 +39,9 @@ def hinge_d_loss(logits_real, logits_fake): def vanilla_d_loss(logits_real, 
logits_fake): d_loss = 0.5 * ( - paddle.mean(x=paddle.nn.functional.softplus(x=-logits_real)) + - paddle.mean(x=paddle.nn.functional.softplus(x=logits_fake))) + paddle.mean(x=paddle.nn.functional.softplus(x=-logits_real)) + + paddle.mean(x=paddle.nn.functional.softplus(x=logits_fake)) + ) return d_loss @@ -52,41 +53,34 @@ def Normalize(in_channels, norm_type="group"): num_channels=in_channels, epsilon=1e-06, weight_attr=None, - bias_attr=None, ) + bias_attr=None, + ) elif norm_type == "batch": return paddle.nn.SyncBatchNorm(in_channels) class ResBlock(paddle.nn.Layer): def __init__( - self, - in_channels, - out_channels=None, - conv_shortcut=False, - dropout=0.0, - norm_type="group", - padding_type="replicate", ): + self, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout=0.0, + norm_type="group", + padding_type="replicate", + ): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels self.out_channels = out_channels self.use_conv_shortcut = conv_shortcut self.norm1 = Normalize(in_channels, norm_type) - self.conv1 = SamePadConv3d( - in_channels, out_channels, kernel_size=3, padding_type=padding_type) + self.conv1 = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type) self.dropout = paddle.nn.Dropout(p=dropout) self.norm2 = Normalize(in_channels, norm_type) - self.conv2 = SamePadConv3d( - out_channels, - out_channels, - kernel_size=3, - padding_type=padding_type) + self.conv2 = SamePadConv3d(out_channels, out_channels, kernel_size=3, padding_type=padding_type) if self.in_channels != self.out_channels: - self.conv_shortcut = SamePadConv3d( - in_channels, - out_channels, - kernel_size=3, - padding_type=padding_type) + self.conv_shortcut = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type) def forward(self, x): h = x @@ -103,18 +97,19 @@ def forward(self, x): class SamePadConv3d(paddle.nn.Layer): def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - bias=True, - padding_type="replicate", ): + self, + in_channels, + out_channels, + kernel_size, + stride=1, + bias=True, + padding_type="replicate", + ): super().__init__() if isinstance(kernel_size, int): - kernel_size = (kernel_size, ) * 3 + kernel_size = (kernel_size,) * 3 if isinstance(stride, int): - stride = (stride, ) * 3 + stride = (stride,) * 3 total_pad = tuple([(k - s) for k, s in zip(kernel_size, stride)]) pad_input = [] for p in total_pad[::-1]: @@ -128,31 +123,31 @@ def __init__( kernel_size=kernel_size, stride=stride, padding=0, - bias_attr=bias, ) + bias_attr=bias, + ) self.weight = self.conv.weight def forward(self, x): return self.conv( - paddle.nn.functional.pad(x=x, - pad=self.pad_input, - mode=self.padding_type, - data_format="NCDHW")) + paddle.nn.functional.pad(x=x, pad=self.pad_input, mode=self.padding_type, data_format="NCDHW") + ) class SamePadConvTranspose3d(paddle.nn.Layer): def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - bias=True, - padding_type="replicate", ): + self, + in_channels, + out_channels, + kernel_size, + stride=1, + bias=True, + padding_type="replicate", + ): super().__init__() if isinstance(kernel_size, int): - kernel_size = (kernel_size, ) * 3 + kernel_size = (kernel_size,) * 3 if isinstance(stride, int): - stride = (stride, ) * 3 + stride = (stride,) * 3 total_pad = tuple([(k - s) for k, s in zip(kernel_size, stride)]) pad_input = [] for p in total_pad[::-1]: @@ -166,45 +161,38 @@ def __init__( 
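SamePadConv3d pads each dimension by kernel_size - stride in total, so that for inputs divisible by the stride the output length is exactly input_length / stride; the total is split with the larger half first and listed starting from the last dimension, matching the reversed loop above. A short sketch of that padding arithmetic, assuming the usual VideoGPT-style split (the exact append statement is outside the hunk):

    def same_pad_amounts(kernel_size, stride):
        # Total padding of (k - s) per dim keeps out_len == in_len // stride
        # whenever in_len is divisible by stride (and k >= s).
        total_pad = [k - s for k, s in zip(kernel_size, stride)]
        pad_input = []
        for p in reversed(total_pad):          # pad amounts are listed last-dim first
            pad_input.extend([p // 2 + p % 2, p // 2])
        return pad_input

    print(same_pad_amounts((3, 3, 3), (1, 1, 1)))  # [1, 1, 1, 1, 1, 1]
    print(same_pad_amounts((4, 4, 4), (2, 2, 2)))  # [1, 1, 1, 1, 1, 1]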
kernel_size=kernel_size, stride=stride, padding=tuple([(k - 1) for k in kernel_size]), - bias_attr=bias, ) + bias_attr=bias, + ) def forward(self, x): return self.convt( - paddle.nn.functional.pad(x=x, - pad=self.pad_input, - mode=self.padding_type, - data_format="NCDHW")) + paddle.nn.functional.pad(x=x, pad=self.pad_input, mode=self.padding_type, data_format="NCDHW") + ) class Encoder(paddle.nn.Layer): def __init__( - self, - n_hiddens, - downsample, - z_channels, - double_z, - image_channel=3, - norm_type="group", - padding_type="replicate", ): + self, + n_hiddens, + downsample, + z_channels, + double_z, + image_channel=3, + norm_type="group", + padding_type="replicate", + ): super().__init__() n_times_downsample = np.array([int(math.log2(d)) for d in downsample]) self.conv_blocks = paddle.nn.LayerList() max_ds = n_times_downsample.max() - self.conv_first = SamePadConv3d( - image_channel, n_hiddens, kernel_size=3, padding_type=padding_type) + self.conv_first = SamePadConv3d(image_channel, n_hiddens, kernel_size=3, padding_type=padding_type) for i in range(max_ds): block = paddle.nn.Layer() in_channels = n_hiddens * 2**i - out_channels = n_hiddens * 2**(i + 1) + out_channels = n_hiddens * 2 ** (i + 1) stride = tuple([(2 if d > 0 else 1) for d in n_times_downsample]) - block.down = SamePadConv3d( - in_channels, - out_channels, - 4, - stride=stride, - padding_type=padding_type) - block.res = ResBlock( - out_channels, out_channels, norm_type=norm_type) + block.down = SamePadConv3d(in_channels, out_channels, 4, stride=stride, padding_type=padding_type) + block.res = ResBlock(out_channels, out_channels, norm_type=norm_type) self.conv_blocks.append(block) n_times_downsample -= 1 self.final_block = paddle.nn.Sequential( @@ -215,7 +203,9 @@ def __init__( 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, - padding_type=padding_type, ), ) + padding_type=padding_type, + ), + ) self.out_channels = out_channels def forward(self, x): @@ -228,12 +218,7 @@ def forward(self, x): class Decoder(paddle.nn.Layer): - def __init__(self, - n_hiddens, - upsample, - z_channels, - image_channel, - norm_type="group"): + def __init__(self, n_hiddens, upsample, z_channels, image_channel, norm_type="group"): super().__init__() n_times_upsample = np.array([int(math.log2(d)) for d in upsample]) max_us = n_times_upsample.max() @@ -241,20 +226,15 @@ def __init__(self, self.conv_blocks = paddle.nn.LayerList() for i in range(max_us): block = paddle.nn.Layer() - in_channels = in_channels if i == 0 else n_hiddens * 2**( - max_us - i + 1) - out_channels = n_hiddens * 2**(max_us - i) + in_channels = in_channels if i == 0 else n_hiddens * 2 ** (max_us - i + 1) + out_channels = n_hiddens * 2 ** (max_us - i) us = tuple([(2 if d > 0 else 1) for d in n_times_upsample]) - block.up = SamePadConvTranspose3d( - in_channels, out_channels, 4, stride=us) - block.res1 = ResBlock( - out_channels, out_channels, norm_type=norm_type) - block.res2 = ResBlock( - out_channels, out_channels, norm_type=norm_type) + block.up = SamePadConvTranspose3d(in_channels, out_channels, 4, stride=us) + block.res1 = ResBlock(out_channels, out_channels, norm_type=norm_type) + block.res2 = ResBlock(out_channels, out_channels, norm_type=norm_type) self.conv_blocks.append(block) n_times_upsample -= 1 - self.conv_out = SamePadConv3d( - out_channels, image_channel, kernel_size=3) + self.conv_out = SamePadConv3d(out_channels, image_channel, kernel_size=3) def forward(self, x): h = x diff --git 
a/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py b/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py index 7a934de7f6224..acc73c41c8fdd 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py @@ -17,8 +17,9 @@ from paddle.distributed.fleet.utils import recompute try: - from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention # noqa + from paddle.incubate.nn.memory_efficient_attention import ( # noqa + memory_efficient_attention, + ) _ppxformers_available = True except: @@ -30,8 +31,15 @@ from einops import rearrange, repeat from ..utils.initializer_utils import constant_, xavier_uniform_ -from .lvdm_util import (GEGLU, Normalize, conv_nd, default, exists, - normalization, zero_module) +from .lvdm_util import ( + GEGLU, + Normalize, + conv_nd, + default, + exists, + normalization, + zero_module, +) def finfo(dtype): @@ -53,15 +61,19 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0): super().__init__() inner_dim = int(dim * mult) dim_out = default(dim_out, dim) - project_in = (paddle.nn.Sequential( - paddle.nn.Linear( - in_features=dim, out_features=inner_dim), - paddle.nn.GELU(), ) if not glu else GEGLU(dim, inner_dim)) + project_in = ( + paddle.nn.Sequential( + paddle.nn.Linear(in_features=dim, out_features=inner_dim), + paddle.nn.GELU(), + ) + if not glu + else GEGLU(dim, inner_dim) + ) self.net = paddle.nn.Sequential( project_in, paddle.nn.Dropout(p=dropout), - paddle.nn.Linear( - in_features=inner_dim, out_features=dim_out), ) + paddle.nn.Linear(in_features=inner_dim, out_features=dim_out), + ) def forward(self, x): return self.net(x) @@ -74,19 +86,19 @@ def __init__(self, num_units, max_relative_position): super().__init__() self.num_units = num_units self.max_relative_position = max_relative_position - self.embeddings_table = paddle.nn.Parameter( - paddle.empty(shape=[max_relative_position * 2 + 1, num_units])) + self.embeddings_table = paddle.nn.Parameter(paddle.empty(shape=[max_relative_position * 2 + 1, num_units])) xavier_uniform_(self.embeddings_table) def forward(self, length_q, length_k): - device = self.embeddings_table.place + # device = self.embeddings_table.place range_vec_q = paddle.arange(end=length_q) range_vec_k = paddle.arange(end=length_k) distance_mat = range_vec_k[(None), :] - range_vec_q[:, (None)] distance_mat_clipped = paddle.clip( x=distance_mat, min=-self.max_relative_position, - max=self.max_relative_position, ) + max=self.max_relative_position, + ) final_mat = distance_mat_clipped + self.max_relative_position final_mat = final_mat.astype(dtype="int64") embeddings = self.embeddings_table[final_mat] @@ -95,15 +107,16 @@ def forward(self, length_q, length_k): class TemporalCrossAttention(paddle.nn.Layer): def __init__( - self, - query_dim, - context_dim=None, - heads=8, - dim_head=64, - dropout=0.0, - use_relative_position=False, - temporal_length=None, - **kwargs, ): + self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.0, + use_relative_position=False, + temporal_length=None, + **kwargs, + ): super().__init__() inner_dim = dim_head * heads context_dim = default(context_dim, query_dim) @@ -112,22 +125,17 @@ def __init__( self.heads = heads self.temporal_length = temporal_length self.use_relative_position = use_relative_position - self.to_q = paddle.nn.Linear( - in_features=query_dim, out_features=inner_dim, bias_attr=False) - self.to_k = paddle.nn.Linear( - in_features=context_dim, 
out_features=inner_dim, bias_attr=False) - self.to_v = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False) + self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) self.to_out = paddle.nn.Sequential( - paddle.nn.Linear( - in_features=inner_dim, out_features=query_dim), - paddle.nn.Dropout(p=dropout), ) + paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), + paddle.nn.Dropout(p=dropout), + ) if use_relative_position: assert temporal_length is not None - self.relative_position_k = RelativePosition( - num_units=dim_head, max_relative_position=temporal_length) - self.relative_position_v = RelativePosition( - num_units=dim_head, max_relative_position=temporal_length) + self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length) + self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length) constant_(self.to_q.weight, 0) constant_(self.to_k.weight, 0) constant_(self.to_v.weight, 0) @@ -162,32 +170,23 @@ def forward(self, x, context=None, mask=None): class CrossAttention(paddle.nn.Layer): - def __init__(self, - query_dim, - context_dim=None, - heads=8, - dim_head=64, - dropout=0.0, - **kwargs): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs): super().__init__() inner_dim = dim_head * heads context_dim = default(context_dim, query_dim) self.scale = dim_head**-0.5 self.heads = heads - self.to_q = paddle.nn.Linear( - in_features=query_dim, out_features=inner_dim, bias_attr=False) - self.to_k = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) - self.to_v = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False) + self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) self.to_out = paddle.nn.Sequential( - paddle.nn.Linear( - in_features=inner_dim, out_features=query_dim), - paddle.nn.Dropout(p=dropout), ) + paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), + paddle.nn.Dropout(p=dropout), + ) def forward(self, x, context=None, mask=None): h = self.heads - b = x.shape[0] + # b = x.shape[0] q = self.to_q(x) context = default(context, x) k = self.to_k(context) @@ -206,13 +205,7 @@ def forward(self, x, context=None, mask=None): class MemoryEfficientCrossAttention(paddle.nn.Layer): - def __init__(self, - query_dim, - context_dim=None, - heads=8, - dim_head=64, - dropout=0.0, - **kwargs): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs): super().__init__() print( f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using {heads} heads." 
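CrossAttention above is plain multi-head scaled-dot-product attention: x is projected to queries, the (optional) context to keys and values, heads are split out, and softmax(q @ k^T * scale) @ v is projected back to query_dim. A minimal NumPy sketch of one forward pass, without masking or dropout (the weights and shapes here are illustrative):

    import numpy as np

    def softmax(x, axis=-1):
        x = x - x.max(axis=axis, keepdims=True)
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    def cross_attention(x, context, wq, wk, wv, wo, heads):
        b, n, _ = x.shape
        m = context.shape[1]
        d = wq.shape[1] // heads                                   # dim per head
        q = (x @ wq).reshape(b, n, heads, d).transpose(0, 2, 1, 3)
        k = (context @ wk).reshape(b, m, heads, d).transpose(0, 2, 1, 3)
        v = (context @ wv).reshape(b, m, heads, d).transpose(0, 2, 1, 3)
        attn = softmax(q @ k.transpose(0, 1, 3, 2) * d ** -0.5)    # (b, heads, n, m)
        out = (attn @ v).transpose(0, 2, 1, 3).reshape(b, n, heads * d)
        return out @ wo

    rng = np.random.default_rng(0)
    x, ctx = rng.standard_normal((2, 16, 64)), rng.standard_normal((2, 77, 96))
    wq, wk, wv, wo = (rng.standard_normal(s) * 0.02 for s in [(64, 128), (96, 128), (96, 128), (128, 64)])
    print(cross_attention(x, ctx, wq, wk, wv, wo, heads=8).shape)  # (2, 16, 64)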
@@ -221,16 +214,13 @@ def __init__(self, context_dim = default(context_dim, query_dim) self.heads = heads self.dim_head = dim_head - self.to_q = paddle.nn.Linear( - in_features=query_dim, out_features=inner_dim, bias_attr=False) - self.to_k = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) - self.to_v = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False) + self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) self.to_out = paddle.nn.Sequential( - paddle.nn.Linear( - in_features=inner_dim, out_features=query_dim), - paddle.nn.Dropout(p=dropout), ) + paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), + paddle.nn.Dropout(p=dropout), + ) self.attention_op = "cutlass" def forward(self, x, context=None, mask=None): @@ -239,8 +229,7 @@ def forward(self, x, context=None, mask=None): k = self.to_k(context) v = self.to_v(context) b, _, _ = q.shape - q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]), - (q, k, v)) + q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]), (q, k, v)) out = F.scaled_dot_product_attention_( q, k, @@ -248,7 +237,8 @@ def forward(self, x, context=None, mask=None): attn_mask=None, dropout_p=0.0, attention_op=self.attention_op, - training=True, ) + training=True, + ) if exists(mask): raise NotImplementedError out = out.reshape([0, 0, self.heads * self.dim_head]) @@ -261,63 +251,46 @@ class BasicTransformerBlockST(paddle.nn.Layer): """ def __init__( - self, - dim, - n_heads, - d_head, - dropout=0.0, - context_dim=None, - gated_ff=True, - checkpoint=True, - temporal_length=None, - use_relative_position=True, - **kwargs, ): + self, + dim, + n_heads, + d_head, + dropout=0.0, + context_dim=None, + gated_ff=True, + checkpoint=True, + temporal_length=None, + use_relative_position=True, + **kwargs, + ): super().__init__() if _ppxformers_available: self.attn1 = MemoryEfficientCrossAttention( - query_dim=dim, - heads=n_heads, - dim_head=d_head, - dropout=dropout, - **kwargs) + query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs + ) self.attn2 = MemoryEfficientCrossAttention( query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, - **kwargs, ) + **kwargs, + ) else: - self.attn1 = CrossAttention( - query_dim=dim, - heads=n_heads, - dim_head=d_head, - dropout=dropout, - **kwargs) + self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs) self.attn2 = CrossAttention( query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, - **kwargs, ) + **kwargs, + ) self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) - self.norm1 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) - self.norm2 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) - self.norm3 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) + self.norm1 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) + self.norm2 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) + self.norm3 = paddle.nn.LayerNorm(normalized_shape=dim, 
epsilon=1e-05, weight_attr=None, bias_attr=None) self.checkpoint = checkpoint self.attn1_tmp = TemporalCrossAttention( query_dim=dim, @@ -326,7 +299,8 @@ def __init__( dropout=dropout, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs, ) + **kwargs, + ) self.attn2_tmp = TemporalCrossAttention( query_dim=dim, heads=n_heads, @@ -335,17 +309,10 @@ def __init__( context_dim=None, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs, ) - self.norm4 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) - self.norm5 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) + **kwargs, + ) + self.norm4 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) + self.norm5 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) def forward(self, x, context=None, **kwargs): if self.checkpoint: @@ -366,8 +333,7 @@ def _forward(self, x, context=None, mask=None): if context is not None: context_ = [] for i in range(context.shape[0]): - context_.append(context[i].unsqueeze(axis=0).tile( - repeat_times=[t, 1, 1])) + context_.append(context[i].unsqueeze(axis=0).tile(repeat_times=[t, 1, 1])) context_ = paddle.concat(x=context_, axis=0) else: context_ = None @@ -389,16 +355,17 @@ class SpatialTemporalTransformer(paddle.nn.Layer): """ def __init__( - self, - in_channels, - n_heads, - d_head, - depth=1, - dropout=0.0, - context_dim=None, - temporal_length=None, - use_relative_position=True, - **kwargs, ): + self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0.0, + context_dim=None, + temporal_length=None, + use_relative_position=True, + **kwargs, + ): super().__init__() self.in_channels = in_channels inner_dim = n_heads * d_head @@ -408,25 +375,32 @@ def __init__( out_channels=inner_dim, kernel_size=1, stride=1, - padding=0, ) - self.transformer_blocks = paddle.nn.LayerList(sublayers=[ - BasicTransformerBlockST( - inner_dim, - n_heads, - d_head, - dropout=dropout, - context_dim=context_dim, - temporal_length=temporal_length, - use_relative_position=use_relative_position, - **kwargs, ) for d in range(depth) - ]) + padding=0, + ) + self.transformer_blocks = paddle.nn.LayerList( + sublayers=[ + BasicTransformerBlockST( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim, + temporal_length=temporal_length, + use_relative_position=use_relative_position, + **kwargs, + ) + for d in range(depth) + ] + ) self.proj_out = zero_module( paddle.nn.Conv3D( in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, - padding=0, )) + padding=0, + ) + ) def forward(self, x, context=None, **kwargs): assert x.dim() == 5, f"x shape = {x.shape}" @@ -441,13 +415,14 @@ def forward(self, x, context=None, **kwargs): class STAttentionBlock(paddle.nn.Layer): def __init__( - self, - channels, - num_heads=1, - num_head_channels=-1, - use_checkpoint=False, - temporal_length=16, - use_relative_position=False, ): + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + temporal_length=16, + use_relative_position=False, + ): super().__init__() if num_head_channels == -1: self.num_heads = num_heads @@ -468,10 +443,12 @@ def __init__( if use_relative_position: self.relative_position_k = RelativePosition( num_units=channels // self.num_heads, - max_relative_position=temporal_length, ) + max_relative_position=temporal_length, + ) 
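The RelativePosition modules instantiated here (defined earlier in this file) turn pairwise frame distances into learned bias embeddings: distances are clipped to [-max_relative_position, max_relative_position], shifted to be non-negative, and used to index a table of 2 * max_relative_position + 1 rows. The index computation, as a small NumPy sketch:

    import numpy as np

    def relative_position_lookup(length_q, length_k, max_rel, table):
        # table: (2 * max_rel + 1, num_units) learned embedding matrix
        dist = np.arange(length_k)[None, :] - np.arange(length_q)[:, None]   # (Lq, Lk)
        idx = np.clip(dist, -max_rel, max_rel) + max_rel                     # in [0, 2 * max_rel]
        return table[idx]                                                    # (Lq, Lk, num_units)

    table = np.random.default_rng(0).standard_normal((2 * 4 + 1, 8))
    print(relative_position_lookup(5, 5, max_rel=4, table=table).shape)      # (5, 5, 8)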
self.relative_position_v = RelativePosition( num_units=channels // self.num_heads, - max_relative_position=temporal_length, ) + max_relative_position=temporal_length, + ) self.proj_out_s = zero_module(conv_nd(1, channels, channels, 1)) self.proj_out_t = zero_module(conv_nd(1, channels, channels, 1)) @@ -512,22 +489,21 @@ def forward(self, qkv, rp=None, mask=None): weight = paddle.einsum( "bct,bcs->bts", (q * scale).reshape([bs * self.n_heads, ch, length]), - (k * scale).reshape([bs * self.n_heads, ch, length]), ) + (k * scale).reshape([bs * self.n_heads, ch, length]), + ) if rp is not None: k_rp, v_rp = rp weight2 = paddle.einsum( "bct,tsc->bst", (q * scale).reshape([bs * self.n_heads, ch, length]), - k_rp, ) + k_rp, + ) weight += weight2 if mask is not None: INF = -100000000.0 - weight = paddle.where( - mask == 0, weight.astype(dtype="float32"), INF) - weight = paddle.nn.functional.softmax( - x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype) - a = paddle.einsum("bts,bcs->bct", weight, - v.reshape([bs * self.n_heads, ch, length])) + weight = paddle.where(mask == 0, weight.astype(dtype="float32"), INF) + weight = paddle.nn.functional.softmax(x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype) + a = paddle.einsum("bts,bcs->bct", weight, v.reshape([bs * self.n_heads, ch, length])) if rp is not None: x = paddle.einsum("bts,tsc->btc", weight, v_rp) perm_3 = list(range(x.ndim)) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_distributions.py b/ppdiffusers/ppdiffusers/models/lvdm_distributions.py index e2b9a88f4c4e5..a66cf086f85d7 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_distributions.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_distributions.py @@ -58,21 +58,26 @@ def kl(self, other=None): elif other is None: return 0.5 * paddle.sum( x=paddle.pow(x=self.mean, y=2) + self.var - 1.0 - self.logvar, - axis=[1, 2, 3], ) + axis=[1, 2, 3], + ) else: return 0.5 * paddle.sum( - x=paddle.pow(x=self.mean - other.mean, y=2) / other.var + - self.var / other.var - 1.0 - self.logvar + other.logvar, - axis=[1, 2, 3], ) + x=paddle.pow(x=self.mean - other.mean, y=2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + axis=[1, 2, 3], + ) def nll(self, sample, dims=[1, 2, 3]): if self.deterministic: return paddle.to_tensor(data=[0.0], dtype="float32") logtwopi = np.log(2.0 * np.pi) return 0.5 * paddle.sum( - x=logtwopi + self.logvar + paddle.pow(x=sample - self.mean, y=2) / - self.var, - axis=dims, ) + x=logtwopi + self.logvar + paddle.pow(x=sample - self.mean, y=2) / self.var, + axis=dims, + ) def mode(self): return self.mean @@ -91,11 +96,11 @@ def normal_kl(mean1, logvar1, mean2, logvar2): tensor = obj break assert tensor is not None, "at least one argument must be a Tensor" - logvar1, logvar2 = [ - (x if isinstance(x, paddle.Tensor) else paddle.to_tensor(data=x)) - for x in (logvar1, logvar2) - ] + logvar1, logvar2 = [(x if isinstance(x, paddle.Tensor) else paddle.to_tensor(data=x)) for x in (logvar1, logvar2)] return 0.5 * ( - -1.0 + logvar2 - logvar1 + paddle.exp(x=(logvar1 - logvar2 - ).astype("float32")) + - (mean1 - mean2)**2 * paddle.exp(x=(-logvar2).astype("float32"))) + -1.0 + + logvar2 + - logvar1 + + paddle.exp(x=(logvar1 - logvar2).astype("float32")) + + (mean1 - mean2) ** 2 * paddle.exp(x=(-logvar2).astype("float32")) + ) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py b/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py index a48a260f655dd..512431be11300 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py +++ 
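The kl, nll, and normal_kl helpers in lvdm_distributions.py above implement the closed-form KL divergence between diagonal Gaussians, KL(N(m1, v1) || N(m2, v2)) = 0.5 * sum((m1 - m2)^2 / v2 + v1 / v2 - 1 - log v1 + log v2), which with m2 = 0 and v2 = 1 reduces to the KL against a standard normal prior. A quick NumPy check of the formula:

    import numpy as np

    def diag_gaussian_kl(mean1, logvar1, mean2=0.0, logvar2=0.0):
        # KL( N(mean1, exp(logvar1)) || N(mean2, exp(logvar2)) ), summed over elements
        var1, var2 = np.exp(logvar1), np.exp(logvar2)
        return 0.5 * np.sum((mean1 - mean2) ** 2 / var2 + var1 / var2 - 1.0 - logvar1 + logvar2)

    mu, logvar = np.array([0.3, -0.1]), np.array([0.2, -0.5])
    print(diag_gaussian_kl(mu, logvar))              # KL against the standard normal prior
    print(diag_gaussian_kl(mu, logvar, mu, logvar))  # 0.0: KL of a distribution with itself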
b/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py @@ -21,10 +21,16 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput -from .lvdm_attention_temporal import (SpatialTemporalTransformer, - STAttentionBlock) -from .lvdm_util import (avg_pool_nd, conv_nd, linear, nonlinearity, - normalization, timestep_embedding, zero_module) +from .lvdm_attention_temporal import SpatialTemporalTransformer, STAttentionBlock +from .lvdm_util import ( + avg_pool_nd, + conv_nd, + linear, + nonlinearity, + normalization, + timestep_embedding, + zero_module, +) from .modeling_utils import ModelMixin @@ -87,13 +93,14 @@ class Upsample(paddle.nn.Layer): """ def __init__( - self, - channels, - use_conv, - dims=2, - out_channels=None, - kernel_size_t=3, - padding_t=1, ): + self, + channels, + use_conv, + dims=2, + out_channels=None, + kernel_size_t=3, + padding_t=1, + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -105,7 +112,8 @@ def __init__( self.channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), ) + padding=(padding_t, 1, 1), + ) def forward(self, x): assert x.shape[1] == self.channels @@ -114,10 +122,10 @@ def forward(self, x): x=x, size=(x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest", - data_format="NCDHW", ) + data_format="NCDHW", + ) else: - x = paddle.nn.functional.interpolate( - x=x, scale_factor=2, mode="nearest") + x = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="nearest") if self.use_conv: x = self.conv(x) return x @@ -133,13 +141,14 @@ class Downsample(paddle.nn.Layer): """ def __init__( - self, - channels, - use_conv, - dims=2, - out_channels=None, - kernel_size_t=3, - padding_t=1, ): + self, + channels, + use_conv, + dims=2, + out_channels=None, + kernel_size_t=3, + padding_t=1, + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -153,7 +162,8 @@ def __init__( self.out_channels, (kernel_size_t, 3, 3), stride=stride, - padding=(padding_t, 1, 1), ) + padding=(padding_t, 1, 1), + ) else: assert self.channels == self.out_channels self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) @@ -179,21 +189,23 @@ class ResBlock(TimestepBlock): :param down: if True, use this block for downsampling. 
""" - def __init__(self, - channels, - emb_channels, - dropout, - out_channels=None, - use_conv=False, - use_scale_shift_norm=False, - dims=2, - use_checkpoint=False, - up=False, - down=False, - kernel_size_t=3, - padding_t=1, - nonlinearity_type="silu", - **kwargs): + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + kernel_size_t=3, + padding_t=1, + nonlinearity_type="silu", + **kwargs + ): super().__init__() self.channels = channels self.emb_channels = emb_channels @@ -211,42 +223,25 @@ def __init__(self, channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), ), ) + padding=(padding_t, 1, 1), + ), + ) self.updown = up or down if up: - self.h_upd = Upsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) - self.x_upd = Upsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) + self.h_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) + self.x_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) elif down: - self.h_upd = Downsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) - self.x_upd = Downsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) + self.h_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) + self.x_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) else: self.h_upd = self.x_upd = paddle.nn.Identity() self.emb_layers = paddle.nn.Sequential( nonlinearity(nonlinearity_type), linear( emb_channels, - 2 * self.out_channels - if use_scale_shift_norm else self.out_channels, ), ) + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) self.out_layers = paddle.nn.Sequential( normalization(self.out_channels), nonlinearity(nonlinearity_type), @@ -257,7 +252,10 @@ def __init__(self, self.out_channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), )), ) + padding=(padding_t, 1, 1), + ) + ), + ) if self.out_channels == channels: self.skip_connection = paddle.nn.Identity() elif use_conv: @@ -266,7 +264,8 @@ def __init__(self, channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), ) + padding=(padding_t, 1, 1), + ) else: self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) @@ -317,11 +316,9 @@ def _forward(self, x, emb): # return STTransformerClass -def make_spatialtemporal_transformer(module_name="attention_temporal", - class_name="SpatialTemporalTransformer"): +def make_spatialtemporal_transformer(module_name="attention_temporal", class_name="SpatialTemporalTransformer"): # Todo: Support loading more types of transformers - assert (module_name == "attention_temporal" and - class_name == "SpatialTemporalTransformer") + assert module_name == "attention_temporal" and class_name == "SpatialTemporalTransformer" return SpatialTemporalTransformer @@ -354,37 +351,39 @@ class LVDMUNet3DModel(ModelMixin, ConfigMixin): """ @register_to_config - def __init__(self, - image_size, - in_channels, - model_channels, - out_channels, - num_res_blocks, - attention_resolutions, - dropout=0, - channel_mult=(1, 2, 4, 8), - conv_resample=True, - dims=3, - num_classes=None, - use_checkpoint=False, - use_fp16=False, - num_heads=-1, - num_head_channels=-1, - 
num_heads_upsample=-1, - use_scale_shift_norm=False, - resblock_updown=False, - transformer_depth=1, - context_dim=None, - legacy=True, - kernel_size_t=1, - padding_t=1, - use_temporal_transformer=False, - temporal_length=None, - use_relative_position=False, - nonlinearity_type="silu", - ST_transformer_module="attention_temporal", - ST_transformer_class="SpatialTemporalTransformer", - **kwargs): + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=3, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + transformer_depth=1, + context_dim=None, + legacy=True, + kernel_size_t=1, + padding_t=1, + use_temporal_transformer=False, + temporal_length=None, + use_relative_position=False, + nonlinearity_type="silu", + ST_transformer_module="attention_temporal", + ST_transformer_class="SpatialTemporalTransformer", + **kwargs + ): super().__init__() if use_temporal_transformer: assert ( @@ -401,11 +400,9 @@ def __init__(self, if num_heads_upsample == -1: num_heads_upsample = num_heads if num_heads == -1: - assert (num_head_channels != -1 - ), "Either num_heads or num_head_channels has to be set" + assert num_head_channels != -1, "Either num_heads or num_head_channels has to be set" if num_head_channels == -1: - assert (num_heads != -1 - ), "Either num_heads or num_head_channels has to be set" + assert num_heads != -1, "Either num_heads or num_head_channels has to be set" self.image_size = image_size self.in_channels = in_channels self.model_channels = model_channels @@ -430,20 +427,26 @@ def __init__(self, self.time_embed = paddle.nn.Sequential( linear(model_channels, time_embed_dim), nonlinearity(nonlinearity_type), - linear(time_embed_dim, time_embed_dim), ) + linear(time_embed_dim, time_embed_dim), + ) if self.num_classes is not None: self.label_emb = paddle.nn.Embedding(num_classes, time_embed_dim) STTransformerClass = make_spatialtemporal_transformer( - module_name=ST_transformer_module, class_name=ST_transformer_class) - self.input_blocks = paddle.nn.LayerList(sublayers=[ - TimestepEmbedSequential( - conv_nd( - dims, - in_channels, - model_channels, - (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), )) - ]) + module_name=ST_transformer_module, class_name=ST_transformer_class + ) + self.input_blocks = paddle.nn.LayerList( + sublayers=[ + TimestepEmbedSequential( + conv_nd( + dims, + in_channels, + model_channels, + (kernel_size_t, 3, 3), + padding=(padding_t, 1, 1), + ) + ) + ] + ) self._feature_size = model_channels input_block_chans = [model_channels] ch = model_channels @@ -462,7 +465,8 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs) + **kwargs, + ) ] ch = mult * model_channels if ds in attention_resolutions: @@ -472,8 +476,7 @@ def __init__(self, num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: - dim_head = (ch // num_heads if use_temporal_transformer - else num_head_channels) + dim_head = ch // num_heads if use_temporal_transformer else num_head_channels layers.append( STAttentionBlock( ch, @@ -481,8 +484,10 @@ def __init__(self, num_heads=num_heads, num_head_channels=dim_head, temporal_length=temporal_length, - use_relative_position=use_relative_position, ) - if not use_temporal_transformer else STTransformerClass( + 
use_relative_position=use_relative_position, + ) + if not use_temporal_transformer + else STTransformerClass( ch, num_heads, dim_head, @@ -490,7 +495,9 @@ def __init__(self, context_dim=context_dim, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs)) + **kwargs, + ) + ) self.input_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch input_block_chans.append(ch) @@ -510,13 +517,19 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs) if resblock_updown else Downsample( - ch, - conv_resample, - dims=dims, - out_channels=out_ch, - kernel_size_t=kernel_size_t, - padding_t=padding_t, ))) + **kwargs, + ) + if resblock_updown + else Downsample( + ch, + conv_resample, + dims=dims, + out_channels=out_ch, + kernel_size_t=kernel_size_t, + padding_t=padding_t, + ) + ) + ) ch = out_ch input_block_chans.append(ch) ds *= 2 @@ -527,8 +540,7 @@ def __init__(self, num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: - dim_head = (ch // num_heads - if use_temporal_transformer else num_head_channels) + dim_head = ch // num_heads if use_temporal_transformer else num_head_channels self.middle_block = TimestepEmbedSequential( ResBlock( ch, @@ -540,15 +552,18 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs), + **kwargs, + ), STAttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads, num_head_channels=dim_head, temporal_length=temporal_length, - use_relative_position=use_relative_position, ) - if not use_temporal_transformer else STTransformerClass( + use_relative_position=use_relative_position, + ) + if not use_temporal_transformer + else STTransformerClass( ch, num_heads, dim_head, @@ -556,7 +571,8 @@ def __init__(self, context_dim=context_dim, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs), + **kwargs, + ), ResBlock( ch, time_embed_dim, @@ -567,7 +583,9 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs), ) + **kwargs, + ), + ) self._feature_size += ch self.output_blocks = paddle.nn.LayerList(sublayers=[]) for level, mult in list(enumerate(channel_mult))[::-1]: @@ -585,7 +603,8 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs) + **kwargs, + ) ] ch = model_channels * mult if ds in attention_resolutions: @@ -595,8 +614,7 @@ def __init__(self, num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: - dim_head = (ch // num_heads if use_temporal_transformer - else num_head_channels) + dim_head = ch // num_heads if use_temporal_transformer else num_head_channels layers.append( STAttentionBlock( ch, @@ -604,8 +622,10 @@ def __init__(self, num_heads=num_heads, num_head_channels=dim_head, temporal_length=temporal_length, - use_relative_position=use_relative_position, ) - if not use_temporal_transformer else STTransformerClass( + use_relative_position=use_relative_position, + ) + if not use_temporal_transformer + else STTransformerClass( ch, num_heads, dim_head, @@ -613,7 +633,9 @@ def __init__(self, context_dim=context_dim, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs)) + **kwargs, + ) + ) if level and i == num_res_blocks: out_ch = ch layers.append( @@ -629,13 +651,18 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, 
nonlinearity_type=nonlinearity_type, - **kwargs) if resblock_updown else Upsample( - ch, - conv_resample, - dims=dims, - out_channels=out_ch, - kernel_size_t=kernel_size_t, - padding_t=padding_t, )) + **kwargs, + ) + if resblock_updown + else Upsample( + ch, + conv_resample, + dims=dims, + out_channels=out_ch, + kernel_size_t=kernel_size_t, + padding_t=padding_t, + ) + ) ds //= 2 self.output_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch @@ -648,7 +675,10 @@ def __init__(self, model_channels, out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), )), ) + padding=(padding_t, 1, 1), + ) + ), + ) def convert_to_fp16(self): """ @@ -666,13 +696,7 @@ def convert_to_fp32(self): self.middle_block.apply(fn=convert_module_to_f32) self.output_blocks.apply(fn=convert_module_to_f32) - def forward(self, - x, - timesteps=None, - time_emb_replace=None, - context=None, - y=None, - **kwargs): + def forward(self, x, timesteps=None, time_emb_replace=None, context=None, y=None, **kwargs): """ Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. @@ -683,13 +707,12 @@ def forward(self, """ hs = [] if time_emb_replace is None: - t_emb = timestep_embedding( - timesteps, self.model_channels, repeat_only=False) + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) emb = self.time_embed(t_emb) else: emb = time_emb_replace if y is not None: - assert y.shape == (x.shape[0], ) + assert y.shape == (x.shape[0],) emb = emb + self.label_emb(y) h = x.astype(self.dtype) for module in self.input_blocks: @@ -711,42 +734,30 @@ class FrameInterpPredUNet(LVDMUNet3DModel): may need to input `mask` to indicate condition, as well as noise level `s` for condition augmentation. """ - def __init__(self, - image_size, - in_channels, - cond_aug_mode=None, - *args, - **kwargs): + def __init__(self, image_size, in_channels, cond_aug_mode=None, *args, **kwargs): super().__init__(image_size, in_channels, *args, **kwargs) if cond_aug_mode == "time_embed": self.time_embed_cond = paddle.nn.Sequential( linear(self.model_channels, self.time_embed_dim), nonlinearity(self.nonlinearity_type), - linear(self.time_embed_dim, self.time_embed_dim), ) + linear(self.time_embed_dim, self.time_embed_dim), + ) elif cond_aug_mode == "learned_embed": pass - def forward(self, - x, - timesteps, - context=None, - y=None, - s=None, - mask=None, - **kwargs): + def forward(self, x, timesteps, context=None, y=None, s=None, mask=None, **kwargs): if s is not None: - s_emb = timestep_embedding( - s, self.model_channels, repeat_only=False) + s_emb = timestep_embedding(s, self.model_channels, repeat_only=False) s_emb = self.time_embed_cond(s_emb) - t_emb = timestep_embedding( - timesteps, self.model_channels, repeat_only=False) + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) emb = self.time_embed(t_emb) assert emb.dim() == 2 mask_ = mask[:, :, :, (0), (0)] t = mask.shape[2] - emb_mix = (emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * - (1 - mask_) + s_emb.unsqueeze(axis=2).tile( - repeat_times=[1, 1, t]) * mask_) + emb_mix = ( + emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * (1 - mask_) + + s_emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * mask_ + ) assert emb_mix.dim() == 3 emb_mix = rearrange(emb_mix, "b c t -> b t c") time_emb_replace = emb_mix @@ -754,10 +765,4 @@ def forward(self, else: time_emb_replace = None timesteps = timesteps - return super().forward( - x, - timesteps, - time_emb_replace=time_emb_replace, - context=context, - 
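FrameInterpPredUNet.forward blends two embeddings per frame: conditioning frames (mask == 1) receive the noise-augmentation embedding s_emb, frames to be generated (mask == 0) keep the ordinary timestep embedding, and the result is rearranged from "b c t" to "b t c". A small NumPy sketch of that mixing step, with illustrative shapes:

    import numpy as np

    def mix_time_embeddings(emb, s_emb, mask_t):
        # emb, s_emb: (b, c) embeddings; mask_t: (b, t), 1 marks a conditioning frame
        t = mask_t.shape[1]
        emb_t = np.repeat(emb[:, :, None], t, axis=2)        # (b, c, t)
        s_emb_t = np.repeat(s_emb[:, :, None], t, axis=2)    # (b, c, t)
        mixed = emb_t * (1 - mask_t[:, None, :]) + s_emb_t * mask_t[:, None, :]
        return np.transpose(mixed, (0, 2, 1))                # rearrange "b c t -> b t c"

    emb, s_emb = np.ones((2, 4)), np.zeros((2, 4))
    mask_t = np.array([[1, 0, 0, 1], [0, 0, 1, 1]], dtype=np.float64)
    print(mix_time_embeddings(emb, s_emb, mask_t)[0, :, 0])   # frames 0 and 3 use s_emb -> 0.0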
y=y, - **kwargs) + return super().forward(x, timesteps, time_emb_replace=time_emb_replace, context=context, y=y, **kwargs) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_util.py b/ppdiffusers/ppdiffusers/models/lvdm_util.py index 18551f6900d0f..a3c8faa7fb7fe 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_util.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_util.py @@ -27,7 +27,7 @@ def make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2): """ mask = paddle.zeros(shape=[t]) mask[:n_interp1] = 1 - mask[t - n_interp2:] = 1 + mask[t - n_interp2 :] = 1 return mask @@ -42,14 +42,15 @@ def make_interp_mask_with_framestride(t, device, frame_stride): def random_temporal_masking( - input_shape, - p_interp, - p_pred, - device, - n_interp1=1, - n_interp2=1, - n_prevs=[1], - interp_frame_stride=None, ): + input_shape, + p_interp, + p_pred, + device, + n_interp1=1, + n_interp2=1, + n_prevs=[1], + interp_frame_stride=None, +): """return mask for masking input, where 1 indicates given real image as condition, 0 indicates noisy samples. """ @@ -61,11 +62,9 @@ def random_temporal_masking( r = random.random() if r < p_interp: if interp_frame_stride is not None: - mask[i] = make_interp_mask_with_framestride(t, device, - interp_frame_stride) + mask[i] = make_interp_mask_with_framestride(t, device, interp_frame_stride) else: - mask[i] = make_interp_mask_with_bothsidescond( - t, device, n_interp1, n_interp2) + mask[i] = make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2) elif p_interp <= r < p_interp + p_pred: n_pred = random.choice(n_prevs) mask[(i), :n_pred] = 1 @@ -76,51 +75,35 @@ def random_temporal_masking( return mask -def make_beta_schedule(schedule, - n_timestep, - linear_start=0.0001, - linear_end=0.02, - cosine_s=0.008): +def make_beta_schedule(schedule, n_timestep, linear_start=0.0001, linear_end=0.02, cosine_s=0.008): if schedule == "linear": - betas = (paddle.linspace( - start=linear_start**0.5, stop=linear_end**0.5, - num=n_timestep).astype("float64")**2) + betas = ( + paddle.linspace(start=linear_start**0.5, stop=linear_end**0.5, num=n_timestep).astype("float64") ** 2 + ) elif schedule == "cosine": - timesteps = (paddle.arange(end=n_timestep + 1).astype("float64") / - n_timestep + cosine_s) + timesteps = paddle.arange(end=n_timestep + 1).astype("float64") / n_timestep + cosine_s alphas = timesteps / (1 + cosine_s) * np.pi / 2 alphas = paddle.cos(x=alphas).pow(y=2) alphas = alphas / alphas[0] betas = 1 - alphas[1:] / alphas[:-1] betas = np.clip(betas, a_min=0, a_max=0.999) elif schedule == "sqrt_linear": - betas = paddle.linspace( - start=linear_start, stop=linear_end, - num=n_timestep).astype("float64") + betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64") elif schedule == "sqrt": - betas = (paddle.linspace( - start=linear_start, stop=linear_end, - num=n_timestep).astype("float64")**0.5) + betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64") ** 0.5 else: raise ValueError(f"schedule '{schedule}' unknown.") return betas.numpy() -def make_ddim_timesteps(ddim_discr_method, - num_ddim_timesteps, - num_ddpm_timesteps, - verbose=True): +def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True): if ddim_discr_method == "uniform": c = num_ddpm_timesteps // num_ddim_timesteps ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) elif ddim_discr_method == "quad": - ddim_timesteps = (np.linspace(0, - np.sqrt(num_ddpm_timesteps * 0.8), - 
num_ddim_timesteps)**2).astype(int) + ddim_timesteps = (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps) ** 2).astype(int) else: - raise NotImplementedError( - f'There is no ddim discretization method called "{ddim_discr_method}"' - ) + raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"') steps_out = ddim_timesteps + 1 if verbose: print(f"Selected timesteps for ddim sampler: {steps_out}") @@ -129,14 +112,10 @@ def make_ddim_timesteps(ddim_discr_method, def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): alphas = alphacums[ddim_timesteps] - alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]] - .tolist()) - sigmas = eta * np.sqrt( - (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) + alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) + sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) if verbose: - print( - f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}" - ) + print(f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}") print( f"For the chosen value of eta, which is {eta}, this results in the following sigma_t schedule for ddim sampler {sigmas}" ) @@ -165,7 +144,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.take_along_axis(axis=-1, indices=t) - return out.reshape([b, *((1, ) * (len(x_shape) - 1))]) + return out.reshape([b, *((1,) * (len(x_shape) - 1))]) def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): @@ -179,14 +158,13 @@ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): """ if not repeat_only: half = dim // 2 - freqs = paddle.exp(x=(-math.log(max_period) * paddle.arange( - start=0, end=half).astype("float32") / half).astype("float32")) + freqs = paddle.exp( + x=(-math.log(max_period) * paddle.arange(start=0, end=half).astype("float32") / half).astype("float32") + ) args = timesteps[:, (None)].astype(dtype="float32") * freqs[None] - embedding = paddle.concat( - x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1) + embedding = paddle.concat(x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1) if dim % 2: - embedding = paddle.concat( - x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1) + embedding = paddle.concat(x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1) else: embedding = repeat(timesteps, "b -> b d", d=dim) return embedding @@ -232,7 +210,8 @@ def Normalize(in_channels): num_channels=in_channels, epsilon=1e-06, weight_attr=None, - bias_attr=None, ) + bias_attr=None, + ) def identity(*args, **kwargs): @@ -249,8 +228,7 @@ def nonlinearity(type="silu"): class GEGLU(paddle.nn.Layer): def __init__(self, dim_in, dim_out): super().__init__() - self.proj = paddle.nn.Linear( - in_features=dim_in, out_features=dim_out * 2) + self.proj = paddle.nn.Linear(in_features=dim_in, out_features=dim_out * 2) def forward(self, x): x, gate = self.proj(x).chunk(chunks=2, axis=-1) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_vae.py b/ppdiffusers/ppdiffusers/models/lvdm_vae.py index 88c1e8a5ac1f0..089afdf908e94 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_vae.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_vae.py @@ -24,11 +24,7 @@ def conv3d(in_channels, out_channels, kernel_size, conv3d_type="SamePadConv3d"): if conv3d_type == "SamePadConv3d": - return SamePadConv3d( - in_channels, - 
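make_beta_schedule and make_ddim_sampling_parameters above follow the usual DDPM/DDIM recipe: the "linear" schedule is linear in sqrt(beta) between sqrt(linear_start) and sqrt(linear_end), and for the selected subsequence of timesteps the DDIM noise scale is sigma_t = eta * sqrt((1 - a_prev) / (1 - a) * (1 - a / a_prev)), with a the cumulative alpha at those steps. A small NumPy sketch tying the two together:

    import numpy as np

    def linear_beta_schedule(n_timestep, linear_start=1e-4, linear_end=2e-2):
        # "linear" here means linear in sqrt(beta), as in the code above
        return np.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep) ** 2

    def ddim_sigmas(alphacums, ddim_timesteps, eta):
        alphas = alphacums[ddim_timesteps]
        alphas_prev = np.concatenate([[alphacums[0]], alphacums[ddim_timesteps[:-1]]])
        return eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))

    betas = linear_beta_schedule(1000)
    alphacums = np.cumprod(1.0 - betas)                  # cumulative alpha_bar_t
    ddim_steps = np.arange(0, 1000, 1000 // 50)          # "uniform" discretization, 50 steps
    print(ddim_sigmas(alphacums, ddim_steps, eta=0.0))   # eta=0 -> deterministic DDIM, all zeros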
out_channels, - kernel_size=kernel_size, - padding_type="replicate") + return SamePadConv3d(in_channels, out_channels, kernel_size=kernel_size, padding_type="replicate") else: raise NotImplementedError @@ -50,23 +46,24 @@ class AutoencoderKLOutput(BaseOutput): class LVDMAutoencoderKL(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - n_hiddens=32, - downsample=[4, 8, 8], - z_channels=4, - double_z=True, - image_channel=3, - norm_type="group", - padding_type="replicate", - upsample=[4, 8, 8], - embed_dim=4, - # ckpt_path=None, - # ignore_keys=[], - image_key="image", - monitor=None, - std=1.0, - mean=0.0, - prob=0.2, ): + self, + n_hiddens=32, + downsample=[4, 8, 8], + z_channels=4, + double_z=True, + image_channel=3, + norm_type="group", + padding_type="replicate", + upsample=[4, 8, 8], + embed_dim=4, + # ckpt_path=None, + # ignore_keys=[], + image_key="image", + monitor=None, + std=1.0, + mean=0.0, + prob=0.2, + ): super().__init__() self.image_key = image_key # pass init params to Encoder @@ -77,7 +74,8 @@ def __init__( double_z=double_z, image_channel=image_channel, norm_type=norm_type, - padding_type=padding_type, ) + padding_type=padding_type, + ) # pass init params to Decoder self.decoder = Decoder( @@ -85,7 +83,8 @@ def __init__( upsample=upsample, z_channels=z_channels, image_channel=image_channel, - norm_type="group", ) + norm_type="group", + ) self.quant_conv = conv3d(2 * z_channels, 2 * embed_dim, 1) self.post_quant_conv = conv3d(embed_dim, z_channels, 1) diff --git a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py index bf8d26d5beaf5..213b2efdd2ca9 100644 --- a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py +++ b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py @@ -20,9 +20,7 @@ ##################### -def convert_pytorch_state_dict_to_paddle(pt_state_dict, - paddle_model: nn.Layer, - sub_layer=None): +def convert_pytorch_state_dict_to_paddle(pt_state_dict, paddle_model: nn.Layer, sub_layer=None): # Step 1: Find Linear layer which need transpose weight linear_need_transpose = [] for k, v in paddle_model.named_sublayers(include_self=True): @@ -51,7 +49,7 @@ def convert_pytorch_state_dict_to_paddle(pt_state_dict, pt_tensor = pt_tensor.T # (2) 0d tensor -> 1d tensor if pt_tensor.ndim == 0: - pt_tensor = pt_tensor.reshape((1, )) + pt_tensor = pt_tensor.reshape((1,)) # (3) name mapping for old_key, new_key in ptname2pdname.items(): pt_key = pt_key.replace(old_key, new_key) @@ -61,10 +59,7 @@ def convert_pytorch_state_dict_to_paddle(pt_state_dict, @classmethod -def convert_pytorch_state_dict_to_paddle_class_method(cls, - pt_state_dict, - paddle_model: nn.Layer, - sub_layer=None): +def convert_pytorch_state_dict_to_paddle_class_method(cls, pt_state_dict, paddle_model: nn.Layer, sub_layer=None): # Step 1: Find Linear layer which need transpose weight linear_need_transpose = [] for k, v in paddle_model.named_sublayers(include_self=True): @@ -96,7 +91,7 @@ def convert_pytorch_state_dict_to_paddle_class_method(cls, pt_tensor = pt_tensor.T # (2) 0d tensor -> 1d tensor if pt_tensor.ndim == 0: - pt_tensor = pt_tensor.reshape((1, )) + pt_tensor = pt_tensor.reshape((1,)) # (3) name mapping for old_key, new_key in ptname2pdname.items(): pt_key = pt_key.replace(old_key, new_key) @@ -137,9 +132,7 @@ def convert_paddle_state_dict_to_pytorch(pd_state_dict, paddle_model: nn.Layer): pd_key = pd_key.replace(new_key, old_key) if hasattr(paddle_model, 
"paddle_torch_name_mapping"): pd_key = paddle_model.paddle_torch_name_mapping.get(pd_key, pd_key) - pytorch_state_dict[pd_key] = (pd_tensor.contiguous() - if hasattr(pd_tensor, "contiguous") else - pd_tensor) + pytorch_state_dict[pd_key] = pd_tensor.contiguous() if hasattr(pd_tensor, "contiguous") else pd_tensor return pytorch_state_dict diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py index 27514475bc7c2..bf9ed3663d724 100644 --- a/ppdiffusers/ppdiffusers/models/modeling_utils.py +++ b/ppdiffusers/ppdiffusers/models/modeling_utils.py @@ -21,16 +21,33 @@ import paddle import paddle.nn as nn -from ..utils import (CONFIG_NAME, DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, - HF_HUB_OFFLINE, LOW_CPU_MEM_USAGE_DEFAULT, - PADDLE_WEIGHTS_NAME, PPDIFFUSERS_CACHE, TO_DIFFUSERS, - TORCH_SAFETENSORS_WEIGHTS_NAME, TORCH_WEIGHTS_NAME, - _add_variant, _get_model_file, deprecate, - is_paddlenlp_available, is_safetensors_available, - is_torch_available, is_torch_file, logging, smart_load) +from ..utils import ( + CONFIG_NAME, + DIFFUSERS_CACHE, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + LOW_CPU_MEM_USAGE_DEFAULT, + PADDLE_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + TO_DIFFUSERS, + TORCH_SAFETENSORS_WEIGHTS_NAME, + TORCH_WEIGHTS_NAME, + _add_variant, + _get_model_file, + deprecate, + is_paddlenlp_available, + is_safetensors_available, + is_torch_available, + is_torch_file, + logging, + smart_load, +) from ..version import VERSION as __version__ from .modeling_pytorch_paddle_utils import ( - convert_paddle_state_dict_to_pytorch, convert_pytorch_state_dict_to_paddle) + convert_paddle_state_dict_to_pytorch, + convert_pytorch_state_dict_to_paddle, +) logger = logging.get_logger(__name__) @@ -87,11 +104,7 @@ def convert_state_dict(state_dict, framework="torch"): state_dict = {k: v.cpu().numpy() for k, v in state_dict.items()} return state_dict elif framework in ["paddle", "pd"]: - state_dict = { - k: paddle.to_tensor( - v, place="cpu") - for k, v in state_dict.items() - } + state_dict = {k: paddle.to_tensor(v, place="cpu") for k, v in state_dict.items()} return state_dict else: raise NotImplementedError(f"Not Implemented {framework} framework!") @@ -129,9 +142,7 @@ class ModelMixin(nn.Layer): [`~models.ModelMixin.save_pretrained`]. 
""" config_name = CONFIG_NAME - _automatically_saved_args = [ - "_ppdiffusers_version", "_class_name", "_name_or_path" - ] + _automatically_saved_args = ["_ppdiffusers_version", "_class_name", "_name_or_path"] _supports_gradient_checkpointing = False def __init__(self): @@ -144,8 +155,7 @@ def __getattr__(self, name: str) -> Any: https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module """ - is_in_config = "_internal_dict" in self.__dict__ and hasattr( - self.__dict__["_internal_dict"], name) + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) is_attribute = name in self.__dict__ if is_in_config and not is_attribute: @@ -155,7 +165,8 @@ def __getattr__(self, name: str) -> Any: "1.0.0", deprecation_message, standard_warn=False, - stacklevel=3, ) + stacklevel=3, + ) return self._internal_dict[name] # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module @@ -171,7 +182,8 @@ def is_gradient_checkpointing(self) -> bool: """ return any( hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing - for m in self.sublayers(include_self=True)) + for m in self.sublayers(include_self=True) + ) def enable_gradient_checkpointing(self): """ @@ -181,9 +193,7 @@ def enable_gradient_checkpointing(self): activations". """ if not self._supports_gradient_checkpointing: - raise ValueError( - f"{self.__class__.__name__} does not support gradient checkpointing." - ) + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") self.apply(partial(self._set_gradient_checkpointing, value=True)) def disable_gradient_checkpointing(self): @@ -196,15 +206,13 @@ def disable_gradient_checkpointing(self): if self._supports_gradient_checkpointing: self.apply(partial(self._set_gradient_checkpointing, value=False)) - def set_use_memory_efficient_attention_xformers( - self, valid: bool, attention_op: Optional[str]=None) -> None: + def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None: # Recursively walk through all the children. # Any children which exposes the set_use_memory_efficient_attention_xformers method # gets the message def fn_recursive_set_mem_eff(module: nn.Layer): if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, - attention_op) + module.set_use_memory_efficient_attention_xformers(valid, attention_op) for child in module.children(): fn_recursive_set_mem_eff(child) @@ -213,8 +221,7 @@ def fn_recursive_set_mem_eff(module: nn.Layer): if isinstance(module, nn.Layer): fn_recursive_set_mem_eff(module) - def enable_xformers_memory_efficient_attention( - self, attention_op: Optional[str]=None): + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None): r""" Enable memory efficient attention as implemented in xformers. 
@@ -249,13 +256,14 @@ def disable_xformers_memory_efficient_attention(self): self.set_use_memory_efficient_attention_xformers(False) def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool=True, - save_function: Callable=None, - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: Optional[bool]=None, ): + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: Optional[bool] = None, + ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `[`~models.ModelMixin.from_pretrained`]` class method. @@ -280,16 +288,11 @@ def save_pretrained( """ if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." - ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) @@ -314,14 +317,11 @@ def save_pretrained( if safe_serialization: if is_torch_available(): save_function = safetensors_torch_save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: save_function = safetensors_numpy_save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") - weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, - variant) + state_dict = convert_state_dict(state_dict, framework="numpy") + weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant) else: if not is_torch_available(): raise ImportError( @@ -329,11 +329,9 @@ def save_pretrained( ) save_function = torch.save weights_name = _add_variant(TORCH_WEIGHTS_NAME, variant) - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") - state_dict = convert_paddle_state_dict_to_pytorch(state_dict, - model_to_save) + state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save) else: save_function = paddle.save weights_name = _add_variant(PADDLE_WEIGHTS_NAME, variant) @@ -341,15 +339,10 @@ def save_pretrained( # Save the model save_function(state_dict, os.path.join(save_directory, weights_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weights_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Instantiate a pretrained pytorch model from a pre-trained model configuration. 
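# A short usage sketch for the save_pretrained / from_pretrained pair touched by the
# hunks above. The checkpoint id and output directory are illustrative assumptions;
# the keyword arguments (subfolder, from_hf_hub, from_diffusers, to_diffusers,
# safe_serialization) are the ones visible in this file.
from ppdiffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", from_hf_hub=True, from_diffusers=True
)
# Re-export the Paddle weights in the diffusers/PyTorch layout; per the checks above,
# safe_serialization=True additionally requires the safetensors package.
unet.save_pretrained("./unet_exported", to_diffusers=True, safe_serialization=True)
reloaded = UNet2DConditionModel.from_pretrained("./unet_exported", from_diffusers=True)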
@@ -425,8 +418,9 @@ def from_pretrained( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) @@ -439,13 +433,11 @@ def from_pretrained( paddle_dtype = kwargs.pop("paddle_dtype", None) subfolder = kwargs.pop("subfolder", None) ignore_keys = kwargs.pop("ignore_keys", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", - LOW_CPU_MEM_USAGE_DEFAULT) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" ) @@ -476,7 +468,8 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, from_hf_hub=from_hf_hub, # whether or not from_hf_hub - **kwargs, ) + **kwargs, + ) # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # Load model @@ -486,8 +479,7 @@ def from_pretrained( try: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=_add_variant( - TORCH_SAFETENSORS_WEIGHTS_NAME, variant), + weights_name=_add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -498,7 +490,8 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) except Exception: @@ -518,7 +511,8 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) else: @@ -535,18 +529,19 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) init_contexts = [] - dtype = set(v.dtype for v in state_dict.values() - if paddle.is_tensor(v) and paddle.is_floating_point(v)) + dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v)) if len(dtype) > 1 and paddle.float32 not in dtype: raise ValueError( f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please" - f" make sure that {model_file} weights have only one dtype.") + f" make sure that {model_file} weights have only one dtype." 
+ ) elif len(dtype) > 1 and paddle.float32 in dtype: dtype = paddle.float32 elif len(dtype) == 0: @@ -580,21 +575,16 @@ def from_pretrained( for k in keys: for ik in ignore_keys: if k.startswith(ik): - logger.warning( - "Deleting key {} from state_dict.".format(k)) + logger.warning("Deleting key {} from state_dict.".format(k)) del state_dict[k] - ( + (model, missing_keys, unexpected_keys, mismatched_keys, error_msgs,) = cls._load_pretrained_model( model, - missing_keys, - unexpected_keys, - mismatched_keys, - error_msgs, ) = cls._load_pretrained_model( - model, - state_dict, - model_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=ignore_mismatched_sizes, ) + state_dict, + model_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + ) loading_info = { "missing_keys": missing_keys, @@ -621,12 +611,13 @@ def from_pretrained( @classmethod def _load_pretrained_model( - cls, - model, - state_dict, - resolved_archive_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=False, ): + cls, + model, + state_dict, + resolved_archive_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=False, + ): # Retrieve missing & unexpected_keys model_state_dict = model.state_dict() loaded_keys = list(state_dict.keys()) @@ -642,21 +633,25 @@ def _load_pretrained_model( model_to_load = model def _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - ignore_mismatched_sizes, ): + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, + ): mismatched_keys = [] for checkpoint_key in loaded_keys: model_key = checkpoint_key - if model_key in model_state_dict and list(state_dict[ - checkpoint_key].shape) != list(model_state_dict[ - model_key].shape): - mismatched_keys.append(( - checkpoint_key, - state_dict[checkpoint_key].shape, - model_state_dict[model_key].shape, )) + if model_key in model_state_dict and list(state_dict[checkpoint_key].shape) != list( + model_state_dict[model_key].shape + ): + mismatched_keys.append( + ( + checkpoint_key, + state_dict[checkpoint_key].shape, + model_state_dict[model_key].shape, + ) + ) del state_dict[checkpoint_key] if ignore_mismatched_sizes: mismatched_keys = [] @@ -668,7 +663,8 @@ def _find_mismatched_keys( state_dict, model_state_dict, original_loaded_keys, - ignore_mismatched_sizes, ) + ignore_mismatched_sizes, + ) error_msgs = [] for key_name, loaded_shape, model_shape in mismatched_keys: error_msgs.append( @@ -679,10 +675,10 @@ def _find_mismatched_keys( if len(error_msgs) > 0: error_msg = "\n\t".join(error_msgs) if "size mismatch" in error_msg: - error_msg += "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." - raise RuntimeError( - f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}" - ) + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
+ ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") if len(unexpected_keys) > 0: logger.warning( @@ -693,11 +689,10 @@ def _find_mismatched_keys( " BertForPreTraining model).\n- This IS NOT expected if you are initializing" f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" " identical (initializing a BertForSequenceClassification model from a" - " BertForSequenceClassification model).") - else: - logger.info( - f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" + " BertForSequenceClassification model)." ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") if len(missing_keys) > 0: logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" @@ -709,17 +704,21 @@ def _find_mismatched_keys( f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" - " without further training.") + " without further training." + ) if len(mismatched_keys) > 0: - mismatched_warning = "\n".join([ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ]) + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" - " able to use it for predictions and inference.") + " able to use it for predictions and inference." + ) return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs @@ -738,9 +737,7 @@ def dtype(self) -> paddle.dtype: """ return get_parameter_dtype(self) - def num_parameters(self, - only_trainable: bool=False, - exclude_embeddings: bool=False) -> int: + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: """ Get number of (optionally, trainable or non-embeddings) parameters in the module. 
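# A small illustration of the counting rule behind num_parameters (its body follows in
# the next hunk): in Paddle a parameter counts as trainable when its stop_gradient flag
# is False. The toy layer below is only an example.
import paddle.nn as nn

layer = nn.Linear(8, 4)
total_params = sum(p.numel() for p in layer.parameters())
trainable_params = sum(p.numel() for p in layer.parameters() if not p.stop_gradient)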
@@ -762,14 +759,11 @@ def num_parameters(self, if isinstance(module_type, nn.Embedding) ] non_embedding_parameters = [ - parameter for name, parameter in self.named_parameters() - if name not in embedding_param_names + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names ] - return sum(p.numel() for p in non_embedding_parameters - if not p.stop_gradient or not only_trainable) + return sum(p.numel() for p in non_embedding_parameters if not p.stop_gradient or not only_trainable) else: - return sum(p.numel() for p in self.parameters() - if not p.stop_gradient or not only_trainable) + return sum(p.numel() for p in self.parameters() if not p.stop_gradient or not only_trainable) def unfreeze_params(params): diff --git a/ppdiffusers/ppdiffusers/models/prior_transformer.py b/ppdiffusers/ppdiffusers/models/prior_transformer.py index 8d1b6af0782a0..90c1da6ee3232 100644 --- a/ppdiffusers/ppdiffusers/models/prior_transformer.py +++ b/ppdiffusers/ppdiffusers/models/prior_transformer.py @@ -65,14 +65,15 @@ class PriorTransformer(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - num_attention_heads: int=32, - attention_head_dim: int=64, - num_layers: int=20, - embedding_dim: int=768, - num_embeddings=77, - additional_embeddings=4, - dropout: float=0.0, ): + self, + num_attention_heads: int = 32, + attention_head_dim: int = 64, + num_layers: int = 20, + embedding_dim: int = 768, + num_embeddings=77, + additional_embeddings=4, + dropout: float = 0.0, + ): super().__init__() self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim @@ -90,20 +91,26 @@ def __init__( self.positional_embedding = self.create_parameter( (1, num_embeddings + additional_embeddings, inner_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) self.prd_embedding = self.create_parameter( (1, 1, inner_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) - self.transformer_blocks = nn.LayerList([ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - activation_fn="gelu", - attention_bias=True, ) for d in range(num_layers) - ]) + default_initializer=nn.initializer.Constant(0.0), + ) + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + activation_fn="gelu", + attention_bias=True, + ) + for d in range(num_layers) + ] + ) self.norm_out = nn.LayerNorm(inner_dim) self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim) @@ -114,29 +121,33 @@ def __init__( num_embeddings + additional_embeddings, num_embeddings + additional_embeddings, ], - NEG_INF, ), - 1, ) + NEG_INF, + ), + 1, + ) causal_attention_mask = causal_attention_mask.unsqueeze(0) - self.register_buffer( - "causal_attention_mask", causal_attention_mask, persistable=False) + self.register_buffer("causal_attention_mask", causal_attention_mask, persistable=False) self.clip_mean = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) self.clip_std = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) def forward( - self, - hidden_states, - 
timestep: Union[paddle.Tensor, float, int], - proj_embedding: paddle.Tensor, - encoder_hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - return_dict: bool=True, ): + self, + hidden_states, + timestep: Union[paddle.Tensor, float, int], + proj_embedding: paddle.Tensor, + encoder_hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ): """ Args: hidden_states (`paddle.Tensor` of shape `(batch_size, embedding_dim)`): @@ -168,8 +179,7 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps * paddle.ones( - (batch_size, ), dtype=timesteps.dtype) + timesteps = timesteps * paddle.ones((batch_size,), dtype=timesteps.dtype) timesteps_projected = self.time_proj(timesteps) @@ -179,13 +189,10 @@ def forward( time_embeddings = self.time_embedding(timesteps_projected) proj_embeddings = self.embedding_proj(proj_embedding) - encoder_hidden_states = self.encoder_hidden_states_proj( - encoder_hidden_states) + encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) hidden_states = self.proj_in(hidden_states) - prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand( - [batch_size, -1, -1]) - positional_embeddings = self.positional_embedding.cast( - hidden_states.dtype) + prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand([batch_size, -1, -1]) + positional_embeddings = self.positional_embedding.cast(hidden_states.dtype) hidden_states = paddle.concat( [ @@ -195,23 +202,21 @@ def forward( hidden_states[:, None, :], prd_embedding, ], - axis=1, ) + axis=1, + ) hidden_states = hidden_states + positional_embeddings if attention_mask is not None: - attention_mask = ( - 1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF + attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF attention_mask = F.pad( attention_mask.unsqueeze(0), (0, self.additional_embeddings), value=0.0, - data_format="NCL", ).squeeze(0) - attention_mask = ( - attention_mask[:, None, :] + self.causal_attention_mask - ).cast(hidden_states.dtype) - attention_mask = attention_mask.repeat_interleave( - self.config.num_attention_heads, axis=0) + data_format="NCL", + ).squeeze(0) + attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).cast(hidden_states.dtype) + attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, axis=0) for block in self.transformer_blocks: hidden_states = block(hidden_states, attention_mask=attention_mask) @@ -221,10 +226,9 @@ def forward( predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) if not return_dict: - return (predicted_image_embedding, ) + return (predicted_image_embedding,) - return PriorTransformerOutput( - predicted_image_embedding=predicted_image_embedding) + return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding) def post_process_latents(self, prior_latents): prior_latents = (prior_latents * self.clip_std) + self.clip_mean diff --git a/ppdiffusers/ppdiffusers/models/resnet.py b/ppdiffusers/ppdiffusers/models/resnet.py index 60998dc3fc1b7..39bf23c59264d 100644 --- a/ppdiffusers/ppdiffusers/models/resnet.py +++ b/ppdiffusers/ppdiffusers/models/resnet.py @@ -37,12 +37,13 @@ class Upsample1D(nn.Layer): """ def __init__( - self, - channels, - use_conv=False, - use_conv_transpose=False, - out_channels=None, - name="conv", ): + self, + channels, + use_conv=False, + 
use_conv_transpose=False, + out_channels=None, + name="conv", + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -54,8 +55,7 @@ def __init__( if use_conv_transpose: self.conv = nn.Conv1DTranspose(channels, self.out_channels, 4, 2, 1) elif use_conv: - self.conv = nn.Conv1D( - self.channels, self.out_channels, 3, padding=1) + self.conv = nn.Conv1D(self.channels, self.out_channels, 3, padding=1) def forward(self, x): assert x.shape[1] == self.channels @@ -81,12 +81,7 @@ class Downsample1D(nn.Layer): padding: """ - def __init__(self, - channels, - use_conv=False, - out_channels=None, - padding=1, - name="conv"): + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -96,12 +91,7 @@ def __init__(self, self.name = name if use_conv: - self.conv = nn.Conv1D( - self.channels, - self.out_channels, - 3, - stride=stride, - padding=padding) + self.conv = nn.Conv1D(self.channels, self.out_channels, 3, stride=stride, padding=padding) else: assert self.channels == self.out_channels self.conv = nn.AvgPool1D(kernel_size=stride, stride=stride) @@ -123,12 +113,13 @@ class Upsample2D(nn.Layer): """ def __init__( - self, - channels, - use_conv=False, - use_conv_transpose=False, - out_channels=None, - name="conv", ): + self, + channels, + use_conv=False, + use_conv_transpose=False, + out_channels=None, + name="conv", + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -164,11 +155,9 @@ def forward(self, hidden_states, output_size=None): # if `output_size` is passed we force the interpolation output # size and do not make use of `scale_factor=2` if output_size is None: - hidden_states = F.interpolate( - hidden_states, scale_factor=2.0, mode="nearest") + hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") else: - hidden_states = F.interpolate( - hidden_states, size=output_size, mode="nearest") + hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") # If the input is bfloat16, we cast back to bfloat16 if dtype == paddle.bfloat16: @@ -195,12 +184,7 @@ class Downsample2D(nn.Layer): padding: """ - def __init__(self, - channels, - use_conv=False, - out_channels=None, - padding=1, - name="conv"): + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -210,12 +194,7 @@ def __init__(self, self.name = name if use_conv: - conv = nn.Conv2D( - self.channels, - self.out_channels, - 3, - stride=stride, - padding=padding) + conv = nn.Conv2D(self.channels, self.out_channels, 3, stride=stride, padding=padding) else: assert self.channels == self.out_channels conv = nn.AvgPool2D(kernel_size=stride, stride=stride) @@ -242,26 +221,16 @@ def forward(self, hidden_states): class FirUpsample2D(nn.Layer): - def __init__(self, - channels=None, - out_channels=None, - use_conv=False, - fir_kernel=(1, 3, 3, 1)): + def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)): super().__init__() out_channels = out_channels if out_channels else channels if use_conv: - self.Conv2d_0 = nn.Conv2D( - channels, out_channels, kernel_size=3, stride=1, padding=1) + self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1) self.use_conv = use_conv self.fir_kernel = fir_kernel self.out_channels = 
out_channels - def _upsample_2d(self, - hidden_states, - weight=None, - kernel=None, - factor=2, - gain=1): + def _upsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1): """Fused `upsample_2d()` followed by `Conv2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more @@ -307,12 +276,12 @@ def _upsample_2d(self, # Determine data dimensions. output_shape = ( (hidden_states.shape[2] - 1) * factor + convH, - (hidden_states.shape[3] - 1) * factor + convW, ) + (hidden_states.shape[3] - 1) * factor + convW, + ) output_padding = ( - output_shape[0] - - (hidden_states.shape[2] - 1) * stride[0] - convH, - output_shape[1] - - (hidden_states.shape[3] - 1) * stride[1] - convW, ) + output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH, + output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW, + ) assert output_padding[0] >= 0 and output_padding[1] >= 0 num_groups = hidden_states.shape[1] // inC @@ -326,55 +295,46 @@ def _upsample_2d(self, weight, stride=stride, output_padding=output_padding, - padding=0, ) + padding=0, + ) output = upfirdn2d_native( inverse_conv, paddle.to_tensor(kernel), - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), ) + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), + ) else: pad_value = kernel.shape[0] - factor output = upfirdn2d_native( hidden_states, paddle.to_tensor(kernel), up=factor, - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), ) + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), + ) return output def forward(self, hidden_states): if self.use_conv: - height = self._upsample_2d( - hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) + height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) height = height + self.Conv2d_0.bias.reshape([1, -1, 1, 1]) else: - height = self._upsample_2d( - hidden_states, kernel=self.fir_kernel, factor=2) + height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) return height class FirDownsample2D(nn.Layer): - def __init__(self, - channels=None, - out_channels=None, - use_conv=False, - fir_kernel=(1, 3, 3, 1)): + def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)): super().__init__() out_channels = out_channels if out_channels else channels if use_conv: - self.Conv2d_0 = nn.Conv2D( - channels, out_channels, kernel_size=3, stride=1, padding=1) + self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1) self.fir_kernel = fir_kernel self.use_conv = use_conv self.out_channels = out_channels - def _downsample_2d(self, - hidden_states, - weight=None, - kernel=None, - factor=2, - gain=1): + def _downsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1): """Fused `Conv2d()` followed by `downsample_2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more efficient than performing the same calculation using standard TensorFlow ops. 
It supports gradients of @@ -414,30 +374,26 @@ def _downsample_2d(self, upfirdn_input = upfirdn2d_native( hidden_states, paddle.to_tensor(kernel), - pad=((pad_value + 1) // 2, pad_value // 2), ) - output = F.conv2d( - upfirdn_input, weight, stride=stride_value, padding=0) + pad=((pad_value + 1) // 2, pad_value // 2), + ) + output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0) else: pad_value = kernel.shape[0] - factor output = upfirdn2d_native( hidden_states, paddle.to_tensor(kernel), down=factor, - pad=((pad_value + 1) // 2, pad_value // 2), ) + pad=((pad_value + 1) // 2, pad_value // 2), + ) return output def forward(self, hidden_states): if self.use_conv: - downsample_input = self._downsample_2d( - hidden_states, - weight=self.Conv2d_0.weight, - kernel=self.fir_kernel) - hidden_states = downsample_input + self.Conv2d_0.bias.reshape( - [1, -1, 1, 1]) + downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel) + hidden_states = downsample_input + self.Conv2d_0.bias.reshape([1, -1, 1, 1]) else: - hidden_states = self._downsample_2d( - hidden_states, kernel=self.fir_kernel, factor=2) + hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) return hidden_states @@ -451,18 +407,16 @@ def __init__(self, pad_mode="reflect"): self.pad = kernel_1d.shape[1] // 2 - 1 self.register_buffer( "kernel", - paddle.matmul( - kernel_1d, kernel_1d, transpose_x=True), - persistable=False, ) + paddle.matmul(kernel_1d, kernel_1d, transpose_x=True), + persistable=False, + ) def forward(self, x): - x = F.pad(x, (self.pad, ) * 4, self.pad_mode) + x = F.pad(x, (self.pad,) * 4, self.pad_mode) weight = paddle.zeros( - [ - x.shape[1], x.shape[1], self.kernel.shape[0], - self.kernel.shape[1] - ], - dtype=x.dtype, ) + [x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]], + dtype=x.dtype, + ) indices = paddle.arange(x.shape[1]) # TODO verify this method weight[indices, indices] = self.kernel.cast(weight.dtype) @@ -477,18 +431,16 @@ def __init__(self, pad_mode="reflect"): self.pad = kernel_1d.shape[1] // 2 - 1 self.register_buffer( "kernel", - paddle.matmul( - kernel_1d, kernel_1d, transpose_x=True), - persistable=False, ) + paddle.matmul(kernel_1d, kernel_1d, transpose_x=True), + persistable=False, + ) def forward(self, x): - x = F.pad(x, ((self.pad + 1) // 2, ) * 4, self.pad_mode) + x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode) weight = paddle.zeros( - [ - x.shape[1], x.shape[1], self.kernel.shape[0], - self.kernel.shape[1] - ], - dtype=x.dtype, ) + [x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]], + dtype=x.dtype, + ) indices = paddle.arange(x.shape[1]) # TODO verify this method weight[indices, indices] = self.kernel.cast(weight.dtype) @@ -527,28 +479,28 @@ class ResnetBlock2D(nn.Layer): """ def __init__( - self, - *, - in_channels, - out_channels=None, - conv_shortcut=False, - dropout=0.0, - temb_channels=512, - groups=32, - groups_out=None, - pre_norm=True, - eps=1e-6, - non_linearity="swish", - skip_time_act: bool=False, # skip_time_act is the same as pre_temb_non_linearity - time_embedding_norm="default", # default, scale_shift, ada_group - kernel=None, - output_scale_factor=1.0, - use_in_shortcut=None, - up=False, - down=False, - conv_shortcut_bias: bool=True, - conv_2d_out_channels: Optional[int]=None, - pre_temb_non_linearity: bool=False, # skip_time_act is the same as pre_temb_non_linearity + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + 
dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-6, + non_linearity="swish", + skip_time_act: bool = False, # skip_time_act is the same as pre_temb_non_linearity + time_embedding_norm="default", # default, scale_shift, ada_group + kernel=None, + output_scale_factor=1.0, + use_in_shortcut=None, + up=False, + down=False, + conv_shortcut_bias: bool = True, + conv_2d_out_channels: Optional[int] = None, + pre_temb_non_linearity: bool = False, # skip_time_act is the same as pre_temb_non_linearity ): super().__init__() self.pre_temb_non_linearity = pre_temb_non_linearity @@ -568,14 +520,11 @@ def __init__( groups_out = groups if self.time_embedding_norm == "ada_group": - self.norm1 = AdaGroupNorm( - temb_channels, in_channels, groups, eps=eps) + self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) else: - self.norm1 = nn.GroupNorm( - num_groups=groups, num_channels=in_channels, epsilon=eps) + self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps) - self.conv1 = nn.Conv2D( - in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.conv1 = nn.Conv2D(in_channels, out_channels, kernel_size=3, stride=1, padding=1) if temb_channels is not None: if self.time_embedding_norm == "default": @@ -585,26 +534,18 @@ def __init__( elif self.time_embedding_norm == "ada_group": self.time_emb_proj = None else: - raise ValueError( - f"unknown time_embedding_norm : {self.time_embedding_norm} ") + raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") else: self.time_emb_proj = None if self.time_embedding_norm == "ada_group": - self.norm2 = AdaGroupNorm( - temb_channels, out_channels, groups_out, eps=eps) + self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) else: - self.norm2 = nn.GroupNorm( - num_groups=groups_out, num_channels=out_channels, epsilon=eps) + self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps) self.dropout = nn.Dropout(dropout) conv_2d_out_channels = conv_2d_out_channels or out_channels - self.conv2 = nn.Conv2D( - out_channels, - conv_2d_out_channels, - kernel_size=3, - stride=1, - padding=1) + self.conv2 = nn.Conv2D(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1) if non_linearity == "swish": self.nonlinearity = lambda x: F.silu(x) @@ -621,8 +562,7 @@ def __init__( fir_kernel = (1, 3, 3, 1) self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel) elif kernel == "sde_vp": - self.upsample = partial( - F.interpolate, scale_factor=2.0, mode="nearest") + self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest") else: self.upsample = Upsample2D(in_channels, use_conv=False) elif self.down: @@ -632,11 +572,9 @@ def __init__( elif kernel == "sde_vp": self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2) else: - self.downsample = Downsample2D( - in_channels, use_conv=False, padding=1, name="op") + self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op") - self.use_in_shortcut = (self.in_channels != conv_2d_out_channels - if use_in_shortcut is None else use_in_shortcut) + self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut self.conv_shortcut = None if self.use_in_shortcut: @@ -646,7 +584,8 @@ def __init__( kernel_size=1, stride=1, padding=0, - bias_attr=conv_shortcut_bias, ) + bias_attr=conv_shortcut_bias, + ) def forward(self, input_tensor, temb): hidden_states = input_tensor @@ -693,8 +632,7 
@@ def forward(self, input_tensor, temb): input_tensor = self.conv_shortcut(input_tensor) # TODO this maybe result -inf, input_tensor's min value -57644 hidden_states's min value -10000 - output_tensor = ( - input_tensor + hidden_states) / self.output_scale_factor + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor return output_tensor @@ -724,8 +662,7 @@ class Conv1dBlock(nn.Layer): def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8): super().__init__() - self.conv1d = nn.Conv1D( - inp_channels, out_channels, kernel_size, padding=kernel_size // 2) + self.conv1d = nn.Conv1D(inp_channels, out_channels, kernel_size, padding=kernel_size // 2) self.group_norm = nn.GroupNorm(n_groups, out_channels) self.mish = nn.Mish() @@ -748,8 +685,9 @@ def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5): self.time_emb_act = nn.Mish() self.time_emb = nn.Linear(embed_dim, out_channels) - self.residual_conv = (nn.Conv1D(inp_channels, out_channels, 1) if - inp_channels != out_channels else nn.Identity()) + self.residual_conv = ( + nn.Conv1D(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity() + ) def forward(self, x, t): """ @@ -799,7 +737,8 @@ def upsample_2d(hidden_states, kernel=None, factor=2, gain=1): hidden_states, kernel, up=factor, - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), ) + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), + ) return output @@ -832,11 +771,7 @@ def downsample_2d(hidden_states, kernel=None, factor=2, gain=1): kernel = kernel * gain pad_value = kernel.shape[0] - factor - output = upfirdn2d_native( - hidden_states, - kernel, - down=factor, - pad=((pad_value + 1) // 2, pad_value // 2)) + output = upfirdn2d_native(hidden_states, kernel, down=factor, pad=((pad_value + 1) // 2, pad_value // 2)) return output @@ -854,9 +789,11 @@ def dummy_pad(tensor, up_x=0, up_y=0): up_x, tensor.shape[5], ], - dtype=tensor.dtype, ), + dtype=tensor.dtype, + ), ], - axis=4, ) + axis=4, + ) if up_y > 0: tensor = paddle.concat( [ @@ -870,9 +807,11 @@ def dummy_pad(tensor, up_x=0, up_y=0): tensor.shape[4], tensor.shape[5], ], - dtype=tensor.dtype, ), + dtype=tensor.dtype, + ), ], - axis=2, ) + axis=2, + ) return tensor @@ -900,23 +839,29 @@ def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)): out = F.pad( out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0), 0, 0], - data_format="NDHWC", ) + data_format="NDHWC", + ) out = out.squeeze(0) - out = out[:, max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), max( - -pad_x0, 0):out.shape[2] - max(-pad_x1, 0), :, ] + out = out[ + :, + max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0), + max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0), + :, + ] out = out.transpose([0, 3, 1, 2]) - out = out.reshape( - [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) w = paddle.flip(kernel, [0, 1]).reshape([1, 1, kernel_h, kernel_w]) out = F.conv2d(out, w) - out = out.reshape([ - -1, - minor, - in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, - in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, - ]) + out = out.reshape( + [ + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + ] + ) out = out.transpose([0, 2, 3, 1]) out = out[:, ::down_y, ::down_x, :] @@ -938,44 +883,48 @@ def __init__(self, in_dim, out_dim=None, dropout=0.0): self.in_dim = in_dim self.out_dim = out_dim self.conv1 = 
nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=in_dim), + nn.GroupNorm(num_groups=32, num_channels=in_dim), nn.Silu(), nn.Conv3D( in_channels=in_dim, out_channels=out_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) self.conv2 = nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=out_dim), + nn.GroupNorm(num_groups=32, num_channels=out_dim), nn.Silu(), nn.Dropout(p=dropout), nn.Conv3D( in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) self.conv3 = nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=out_dim), + nn.GroupNorm(num_groups=32, num_channels=out_dim), nn.Silu(), nn.Dropout(p=dropout), nn.Conv3D( in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) self.conv4 = nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=out_dim), + nn.GroupNorm(num_groups=32, num_channels=out_dim), nn.Silu(), nn.Dropout(p=dropout), nn.Conv3D( in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) zeros_(self.conv4[-1].weight) zeros_(self.conv4[-1].bias) @@ -983,14 +932,15 @@ def forward(self, hidden_states, num_frames=1): hidden_states = ( hidden_states[None, :] .reshape((-1, num_frames) + tuple(hidden_states.shape[1:])) - .transpose(perm=[0, 2, 1, 3, 4])) + .transpose(perm=[0, 2, 1, 3, 4]) + ) identity = hidden_states hidden_states = self.conv1(hidden_states) hidden_states = self.conv2(hidden_states) hidden_states = self.conv3(hidden_states) hidden_states = self.conv4(hidden_states) hidden_states = identity + hidden_states - hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape(( - hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple( - hidden_states.shape[3:])) + hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape( + (hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(hidden_states.shape[3:]) + ) return hidden_states diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py index fabe9f4eaec86..2d0a45bcc46c9 100644 --- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py +++ b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py @@ -26,31 +26,30 @@ class T5FilmDecoder(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - input_dims: int=128, - targets_length: int=256, - max_decoder_noise_time: float=2000.0, - d_model: int=768, - num_layers: int=12, - num_heads: int=12, - d_kv: int=64, - d_ff: int=2048, - dropout_rate: float=0.1, ): + self, + input_dims: int = 128, + targets_length: int = 256, + max_decoder_noise_time: float = 2000.0, + d_model: int = 768, + num_layers: int = 12, + num_heads: int = 12, + d_kv: int = 64, + d_ff: int = 2048, + dropout_rate: float = 0.1, + ): super().__init__() self.conditioning_emb = nn.Sequential( - nn.Linear( - d_model, d_model * 4, bias_attr=False), + nn.Linear(d_model, d_model * 4, bias_attr=False), nn.Silu(), - nn.Linear( - d_model * 4, d_model * 4, bias_attr=False), - nn.Silu(), ) + nn.Linear(d_model * 4, d_model * 4, bias_attr=False), + nn.Silu(), + ) self.position_encoding = nn.Embedding(targets_length, d_model) self.position_encoding.weight.stop_gradient = True - self.continuous_inputs_projection = nn.Linear( - input_dims, d_model, bias_attr=False) + self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias_attr=False) self.dropout = 
nn.Dropout(p=dropout_rate) @@ -62,7 +61,8 @@ def __init__( d_kv=d_kv, num_heads=num_heads, d_ff=d_ff, - dropout_rate=dropout_rate, ) + dropout_rate=dropout_rate, + ) self.decoders.append(lyr) self.decoder_norm = T5LayerNorm(d_model) @@ -71,13 +71,10 @@ def __init__( self.spec_out = nn.Linear(d_model, input_dims, bias_attr=False) def encoder_decoder_mask(self, query_input, key_input): - mask = paddle.multiply( - query_input.unsqueeze(-1), - key_input.unsqueeze(-2).cast(query_input.dtype)) + mask = paddle.multiply(query_input.unsqueeze(-1), key_input.unsqueeze(-2).cast(query_input.dtype)) return mask.unsqueeze(-3) - def forward(self, encodings_and_masks, decoder_input_tokens, - decoder_noise_time): + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape[0] == batch @@ -85,7 +82,8 @@ def forward(self, encodings_and_masks, decoder_input_tokens, time_steps = get_timestep_embedding( decoder_noise_time * self.config.max_decoder_noise_time, embedding_dim=self.config.d_model, - max_period=self.config.max_decoder_noise_time, ).cast(self.dtype) + max_period=self.config.max_decoder_noise_time, + ).cast(self.dtype) conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1) @@ -96,37 +94,34 @@ def forward(self, encodings_and_masks, decoder_input_tokens, # If we want to use relative positions for audio context, we can just offset # this sequence by the length of encodings_and_masks. decoder_positions = paddle.broadcast_to( - paddle.arange(seq_length, ), - shape=(batch, seq_length), ) + paddle.arange( + seq_length, + ), + shape=(batch, seq_length), + ) position_encodings = self.position_encoding(decoder_positions) - inputs = self.continuous_inputs_projection( - decoder_input_tokens.cast(position_encodings.dtype)) + inputs = self.continuous_inputs_projection(decoder_input_tokens.cast(position_encodings.dtype)) inputs += position_encodings y = self.dropout(inputs) # decoder: No padding present. - decoder_mask = paddle.ones( - decoder_input_tokens.shape[:2], dtype=inputs.dtype) + decoder_mask = paddle.ones(decoder_input_tokens.shape[:2], dtype=inputs.dtype) # Translate encoding masks to encoder-decoder masks. 
- encodings_and_encdec_masks = [ - (x, self.encoder_decoder_mask(decoder_mask, y)) - for x, y in encodings_and_masks - ] + encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] # cross attend style: concat encodings - encoded = paddle.concat( - [x[0] for x in encodings_and_encdec_masks], axis=1) - encoder_decoder_mask = paddle.concat( - [x[1] for x in encodings_and_encdec_masks], axis=-1) + encoded = paddle.concat([x[0] for x in encodings_and_encdec_masks], axis=1) + encoder_decoder_mask = paddle.concat([x[1] for x in encodings_and_encdec_masks], axis=-1) for lyr in self.decoders: y = lyr( y, conditioning_emb=conditioning_emb, encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, )[0] + encoder_attention_mask=encoder_decoder_mask, + )[0] y = self.decoder_norm(y) y = self.post_dropout(y) @@ -136,13 +131,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, class DecoderLayer(nn.Layer): - def __init__(self, - d_model, - d_kv, - num_heads, - d_ff, - dropout_rate, - layer_norm_epsilon=1e-6): + def __init__(self, d_model, d_kv, num_heads, d_ff, dropout_rate, layer_norm_epsilon=1e-6): super().__init__() self.layer = nn.LayerList() @@ -152,7 +141,9 @@ def __init__(self, d_model=d_model, d_kv=d_kv, num_heads=num_heads, - dropout_rate=dropout_rate, )) + dropout_rate=dropout_rate, + ) + ) # cross attention: layer 1 self.layer.append( @@ -161,7 +152,9 @@ def __init__(self, d_kv=d_kv, num_heads=num_heads, dropout_rate=dropout_rate, - layer_norm_epsilon=layer_norm_epsilon, )) + layer_norm_epsilon=layer_norm_epsilon, + ) + ) # Film Cond MLP + dropout: last layer self.layer.append( @@ -169,62 +162,67 @@ def __init__(self, d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate, - layer_norm_epsilon=layer_norm_epsilon, )) + layer_norm_epsilon=layer_norm_epsilon, + ) + ) def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, ): + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + ): hidden_states = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) if encoder_hidden_states is not None: - encoder_extended_attention_mask = paddle.where( - encoder_attention_mask > 0, 0.0, - -1e10).cast(encoder_hidden_states.dtype) + encoder_extended_attention_mask = paddle.where(encoder_attention_mask > 0, 0.0, -1e10).cast( + encoder_hidden_states.dtype + ) hidden_states = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=encoder_extended_attention_mask, ) + attention_mask=encoder_extended_attention_mask, + ) # Apply Film Conditional Feed Forward layer hidden_states = self.layer[-1](hidden_states, conditioning_emb) - return (hidden_states, ) + return (hidden_states,) class T5LayerSelfAttentionCond(nn.Layer): def __init__(self, d_model, d_kv, num_heads, dropout_rate): super().__init__() self.layer_norm = T5LayerNorm(d_model) - self.FiLMLayer = T5FiLMLayer( - in_features=d_model * 4, out_features=d_model) + self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) self.attention = Attention( query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, - scale_qk=False, ) + scale_qk=False, + ) self.dropout = nn.Dropout(dropout_rate) def forward( - 
self, - hidden_states, - conditioning_emb=None, - attention_mask=None, ): + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + ): # pre_self_attention_layer_norm normed_hidden_states = self.layer_norm(hidden_states) if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, - conditioning_emb) + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) # Self-attention block attention_output = self.attention(normed_hidden_states) @@ -235,28 +233,30 @@ def forward( class T5LayerCrossAttention(nn.Layer): - def __init__(self, d_model, d_kv, num_heads, dropout_rate, - layer_norm_epsilon): + def __init__(self, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon): super().__init__() self.attention = Attention( query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, - scale_qk=False, ) + scale_qk=False, + ) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward( - self, - hidden_states, - key_value_states=None, - attention_mask=None, ): + self, + hidden_states, + key_value_states=None, + attention_mask=None, + ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.attention( normed_hidden_states, encoder_hidden_states=key_value_states, - attention_mask=attention_mask.squeeze(1), ) + attention_mask=attention_mask.squeeze(1), + ) layer_output = hidden_states + self.dropout(attention_output) return layer_output @@ -264,8 +264,7 @@ def forward( class T5LayerFFCond(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon): super().__init__() - self.DenseReluDense = T5DenseGatedActDense( - d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate) + self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate) self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) @@ -306,9 +305,7 @@ class T5LayerNorm(nn.Layer): def __init__(self, hidden_size, eps=1e-6): super().__init__() - self.weight = self.create_parameter( - shape=[hidden_size], - default_initializer=nn.initializer.Constant(1.0)) + self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0)) self.variance_epsilon = eps def forward(self, hidden_states): @@ -317,10 +314,8 @@ def forward(self, hidden_states): # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 - variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean( - axis=-1, keepdim=True) - hidden_states = hidden_states * paddle.rsqrt(variance + - self.variance_epsilon) + variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean(axis=-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) # convert into half-precision if necessary if self.weight.dtype == paddle.float16: @@ -335,9 +330,9 @@ class NewGELUActivation(nn.Layer): """ def forward(self, input: paddle.Tensor) -> paddle.Tensor: - return (0.5 * input * (1.0 + paddle.tanh( - math.sqrt(2.0 / math.pi) * - (input + 0.044715 * paddle.pow(input, 3.0))))) + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) class T5FiLMLayer(nn.Layer): @@ -347,8 +342,7 @@ class T5FiLMLayer(nn.Layer): def __init__(self, in_features, out_features): super().__init__() - self.scale_bias = nn.Linear( - in_features, out_features * 2, bias_attr=False) + self.scale_bias = nn.Linear(in_features, out_features * 2, bias_attr=False) def forward(self, x, conditioning_emb): emb = self.scale_bias(conditioning_emb) diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py index e9f47cbee3f7b..2207b8b46974e 100644 --- a/ppdiffusers/ppdiffusers/models/transformer_2d.py +++ b/ppdiffusers/ppdiffusers/models/transformer_2d.py @@ -79,26 +79,27 @@ class Transformer2DModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - num_attention_heads: int=16, - attention_head_dim: int=88, - in_channels: Optional[int]=None, - out_channels: Optional[int]=None, - num_layers: int=1, - dropout: float=0.0, - norm_num_groups: int=32, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - sample_size: Optional[int]=None, - num_vector_embeds: Optional[int]=None, - patch_size: Optional[int]=None, - activation_fn: str="geglu", - num_embeds_ada_norm: Optional[int]=None, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - norm_type: str="layer_norm", - norm_elementwise_affine: bool=True, ): + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + norm_elementwise_affine: bool = True, + ): super().__init__() self.use_linear_projection = use_linear_projection self.num_attention_heads = num_attention_heads @@ -107,8 +108,7 @@ def __init__( # 1. 
Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` # Define whether input is continuous or discrete depending on configuration - self.is_input_continuous = (in_channels is not None) and ( - patch_size is None) + self.is_input_continuous = (in_channels is not None) and (patch_size is None) self.is_input_vectorized = num_vector_embeds is not None self.is_input_patches = in_channels is not None and patch_size is not None @@ -124,7 +124,8 @@ def __init__( "norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) norm_type = "ada_norm" if self.is_input_continuous and self.is_input_vectorized: @@ -137,8 +138,7 @@ def __init__( f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" " sure that either `num_vector_embeds` or `num_patches` is None." ) - elif (not self.is_input_continuous and not self.is_input_vectorized and - not self.is_input_patches): + elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: raise ValueError( f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." @@ -148,22 +148,14 @@ def __init__( if self.is_input_continuous: self.in_channels = in_channels - self.norm = nn.GroupNorm( - num_groups=norm_num_groups, - num_channels=in_channels, - epsilon=1e-6) + self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-6) if use_linear_projection: self.proj_in = nn.Linear(in_channels, inner_dim) else: - self.proj_in = nn.Conv2D( - in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + self.proj_in = nn.Conv2D(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) elif self.is_input_vectorized: - assert ( - sample_size is not None - ), "Transformer2DModel over discrete input must provide sample_size" - assert ( - num_vector_embeds is not None - ), "Transformer2DModel over discrete input must provide num_embed" + assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" + assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" self.height = sample_size self.width = sample_size @@ -174,11 +166,10 @@ def __init__( num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, - width=self.width, ) + width=self.width, + ) elif self.is_input_patches: - assert ( - sample_size is not None - ), "Transformer2DModel over patched input must provide sample_size" + assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size" self.height = sample_size self.width = sample_size @@ -189,25 +180,29 @@ def __init__( width=sample_size, patch_size=patch_size, in_channels=in_channels, - embed_dim=inner_dim, ) + embed_dim=inner_dim, + ) # 3. 
Define transformers blocks - self.transformer_blocks = nn.LayerList([ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - cross_attention_dim=cross_attention_dim, - activation_fn=activation_fn, - num_embeds_ada_norm=num_embeds_ada_norm, - attention_bias=attention_bias, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - norm_type=norm_type, - norm_elementwise_affine=norm_elementwise_affine, ) - for d in range(num_layers) - ]) + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=norm_elementwise_affine, + ) + for d in range(num_layers) + ] + ) # 4. Define output layers self.out_channels = in_channels if out_channels is None else out_channels @@ -216,8 +211,7 @@ def __init__( if use_linear_projection: self.proj_out = nn.Linear(inner_dim, in_channels) else: - self.proj_out = nn.Conv2D( - inner_dim, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Conv2D(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) elif self.is_input_vectorized: self.norm_out = nn.LayerNorm(inner_dim) self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1) @@ -226,17 +220,17 @@ def __init__( norm_kwargs = {"weight_attr": False, "bias_attr": False} self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs) self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim) - self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * - self.out_channels) + self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) def forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + class_labels=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): """ Args: hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. @@ -270,8 +264,7 @@ def forward( if self.use_linear_projection: hidden_states = self.proj_in(hidden_states) elif self.is_input_vectorized: - hidden_states = self.latent_image_embedding( - hidden_states.cast("int64")) + hidden_states = self.latent_image_embedding(hidden_states.cast("int64")) elif self.is_input_patches: hidden_states = self.pos_embed(hidden_states) @@ -282,14 +275,14 @@ def forward( encoder_hidden_states=encoder_hidden_states, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, - class_labels=class_labels, ) + class_labels=class_labels, + ) # 3. 
Output if self.is_input_continuous: if self.use_linear_projection: hidden_states = self.proj_out(hidden_states) - hidden_states = hidden_states.reshape( - [-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2]) + hidden_states = hidden_states.reshape([-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2]) if not self.use_linear_projection: hidden_states = self.proj_out(hidden_states) output = hidden_states + residual @@ -300,31 +293,32 @@ def forward( logits = logits.transpose([0, 2, 1]) # log(p(x_0)) - output = F.log_softmax( - logits.cast("float64"), axis=1).cast("float32") + output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") elif self.is_input_patches: # TODO: cleanup! conditioning = self.transformer_blocks[0].norm1.emb( - timestep, class_labels, hidden_dtype=hidden_states.dtype) - shift, scale = self.proj_out_1(F.silu(conditioning)).chunk( - 2, axis=1) - hidden_states = (self.norm_out(hidden_states) * - (1 + scale[:, None]) + shift[:, None]) + timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1) + hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] hidden_states = self.proj_out_2(hidden_states) # unpatchify - height = width = int(hidden_states.shape[1]**0.5) + height = width = int(hidden_states.shape[1] ** 0.5) hidden_states = hidden_states.reshape( - (-1, height, width, self.patch_size, self.patch_size, - self.out_channels)) + (-1, height, width, self.patch_size, self.patch_size, self.out_channels) + ) hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states) - output = hidden_states.reshape(( - -1, - self.out_channels, - height * self.patch_size, - width * self.patch_size, )) + output = hidden_states.reshape( + ( + -1, + self.out_channels, + height * self.patch_size, + width * self.patch_size, + ) + ) if not return_dict: - return (output, ) + return (output,) return Transformer2DModelOutput(sample=output) diff --git a/ppdiffusers/ppdiffusers/models/transformer_temporal.py b/ppdiffusers/ppdiffusers/models/transformer_temporal.py index 0052335c043f4..bfd1985eb99a7 100644 --- a/ppdiffusers/ppdiffusers/models/transformer_temporal.py +++ b/ppdiffusers/ppdiffusers/models/transformer_temporal.py @@ -60,52 +60,56 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - num_attention_heads: int=16, - attention_head_dim: int=88, - in_channels: Optional[int]=None, - out_channels: Optional[int]=None, - num_layers: int=1, - dropout: float=0.0, - norm_num_groups: int=32, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - sample_size: Optional[int]=None, - activation_fn: str="geglu", - norm_elementwise_affine: bool=True, - double_self_attention: bool=True, ): + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + activation_fn: str = "geglu", + norm_elementwise_affine: bool = True, + double_self_attention: bool = True, + ): super().__init__() self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim inner_dim = num_attention_heads * attention_head_dim self.in_channels = in_channels - self.norm = nn.GroupNorm( - num_groups=norm_num_groups, num_channels=in_channels, 
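The unpatchify step in the patched-output branch of Transformer2DModel.forward above is dense; here are the same reshapes as a standalone sketch (hypothetical helper name, identical einsum pattern):

```python
import paddle

def unpatchify(hidden_states: paddle.Tensor, patch_size: int, out_channels: int) -> paddle.Tensor:
    # hidden_states: (batch, num_patches, patch_size * patch_size * out_channels)
    height = width = int(hidden_states.shape[1] ** 0.5)
    hidden_states = hidden_states.reshape((-1, height, width, patch_size, patch_size, out_channels))
    # Move channels to the front and interleave the patch pixels back into the spatial grid.
    hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states)
    return hidden_states.reshape((-1, out_channels, height * patch_size, width * patch_size))
```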
epsilon=1e-06) + self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06) self.proj_in = nn.Linear(in_channels, inner_dim) - self.transformer_blocks = nn.LayerList([ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - cross_attention_dim=cross_attention_dim, - activation_fn=activation_fn, - attention_bias=attention_bias, - double_self_attention=double_self_attention, - norm_elementwise_affine=norm_elementwise_affine, ) - for d in range(num_layers) - ]) + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + attention_bias=attention_bias, + double_self_attention=double_self_attention, + norm_elementwise_affine=norm_elementwise_affine, + ) + for d in range(num_layers) + ] + ) self.proj_out = nn.Linear(inner_dim, in_channels) def forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - num_frames=1, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + class_labels=None, + num_frames=1, + cross_attention_kwargs=None, + return_dict: bool = True, + ): """ Args: hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. @@ -131,12 +135,12 @@ def forward( batch_frames, channel, height, width = hidden_states.shape batch_size = batch_frames // num_frames residual = hidden_states - hidden_states = hidden_states[None, :].reshape( - (batch_size, num_frames, channel, height, width)) + hidden_states = hidden_states[None, :].reshape((batch_size, num_frames, channel, height, width)) hidden_states = hidden_states.transpose([0, 2, 1, 3, 4]) hidden_states = self.norm(hidden_states) hidden_states = hidden_states.transpose([0, 3, 4, 2, 1]).reshape( - (batch_size * height * width, num_frames, channel)) + (batch_size * height * width, num_frames, channel) + ) hidden_states = self.proj_in(hidden_states) # 2. Blocks for block in self.transformer_blocks: @@ -145,15 +149,17 @@ def forward( encoder_hidden_states=encoder_hidden_states, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, - class_labels=class_labels, ) + class_labels=class_labels, + ) # 3. 
Output hidden_states = self.proj_out(hidden_states) - hidden_states = (hidden_states[None, None, :].reshape( - (batch_size, height, width, channel, num_frames)) - .transpose([0, 3, 4, 1, 2])) - hidden_states = hidden_states.reshape( - (batch_frames, channel, height, width)) + hidden_states = ( + hidden_states[None, None, :] + .reshape((batch_size, height, width, channel, num_frames)) + .transpose([0, 3, 4, 1, 2]) + ) + hidden_states = hidden_states.reshape((batch_frames, channel, height, width)) output = hidden_states + residual if not return_dict: - return (output, ) + return (output,) return TransformerTemporalModelOutput(sample=output) diff --git a/ppdiffusers/ppdiffusers/models/unet_1d.py b/ppdiffusers/ppdiffusers/models/unet_1d.py index 70ecea668c88f..df62f8477b0bb 100644 --- a/ppdiffusers/ppdiffusers/models/unet_1d.py +++ b/ppdiffusers/ppdiffusers/models/unet_1d.py @@ -23,8 +23,7 @@ from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_1d_blocks import (get_down_block, get_mid_block, get_out_block, - get_up_block) +from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block @dataclass @@ -73,29 +72,30 @@ class UNet1DModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - sample_size: int=65536, - sample_rate: Optional[int]=None, - in_channels: int=2, - out_channels: int=2, - extra_in_channels: int=0, - time_embedding_type: str="fourier", - flip_sin_to_cos: bool=True, - use_timestep_embedding: bool=False, - freq_shift: float=0.0, - down_block_types: Tuple[str]=( - "DownBlock1DNoSkip", - "DownBlock1D", - "AttnDownBlock1D", ), - up_block_types: Tuple[str]=("AttnUpBlock1D", "UpBlock1D", - "UpBlock1DNoSkip"), - mid_block_type: Tuple[str]="UNetMidBlock1D", - out_block_type: str=None, - block_out_channels: Tuple[int]=(32, 32, 64), - act_fn: str=None, - norm_num_groups: int=8, - layers_per_block: int=1, - downsample_each_block: bool=False, ): + self, + sample_size: int = 65536, + sample_rate: Optional[int] = None, + in_channels: int = 2, + out_channels: int = 2, + extra_in_channels: int = 0, + time_embedding_type: str = "fourier", + flip_sin_to_cos: bool = True, + use_timestep_embedding: bool = False, + freq_shift: float = 0.0, + down_block_types: Tuple[str] = ( + "DownBlock1DNoSkip", + "DownBlock1D", + "AttnDownBlock1D", + ), + up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), + mid_block_type: Tuple[str] = "UNetMidBlock1D", + out_block_type: str = None, + block_out_channels: Tuple[int] = (32, 32, 64), + act_fn: str = None, + norm_num_groups: int = 8, + layers_per_block: int = 1, + downsample_each_block: bool = False, + ): super().__init__() self.sample_size = sample_size @@ -105,13 +105,15 @@ def __init__( embedding_size=8, set_W_to_weight=False, log=False, - flip_sin_to_cos=flip_sin_to_cos, ) + flip_sin_to_cos=flip_sin_to_cos, + ) timestep_input_dim = 2 * block_out_channels[0] elif time_embedding_type == "positional": self.time_proj = Timesteps( block_out_channels[0], flip_sin_to_cos=flip_sin_to_cos, - downscale_freq_shift=freq_shift, ) + downscale_freq_shift=freq_shift, + ) timestep_input_dim = block_out_channels[0] if use_timestep_embedding: @@ -120,7 +122,8 @@ def __init__( in_channels=timestep_input_dim, time_embed_dim=time_embed_dim, act_fn=act_fn, - out_dim=block_out_channels[0], ) + out_dim=block_out_channels[0], + ) self.down_blocks = nn.LayerList([]) self.mid_block = None @@ -144,7 +147,8 @@ def 
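The reshape chain at the top of TransformerTemporalModel.forward above converts per-frame feature maps into per-pixel frame sequences; a sketch of just that conversion (the GroupNorm is omitted and the helper name is illustrative):

```python
import paddle

def frames_to_tokens(hidden_states: paddle.Tensor, num_frames: int) -> paddle.Tensor:
    # (batch * frames, C, H, W) -> (batch * H * W, frames, C)
    batch_frames, channel, height, width = hidden_states.shape
    batch_size = batch_frames // num_frames
    hidden_states = hidden_states.reshape((batch_size, num_frames, channel, height, width))
    hidden_states = hidden_states.transpose([0, 2, 1, 3, 4])  # (B, C, F, H, W); the norm runs in this layout
    hidden_states = hidden_states.transpose([0, 3, 4, 2, 1])  # (B, H, W, F, C)
    return hidden_states.reshape((batch_size * height * width, num_frames, channel))
```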
__init__( in_channels=input_channel, out_channels=output_channel, temb_channels=block_out_channels[0], - add_downsample=not is_final_block or downsample_each_block, ) + add_downsample=not is_final_block or downsample_each_block, + ) self.down_blocks.append(down_block) # mid @@ -155,7 +159,8 @@ def __init__( out_channels=block_out_channels[-1], embed_dim=block_out_channels[0], num_layers=layers_per_block, - add_downsample=downsample_each_block, ) + add_downsample=downsample_each_block, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -167,9 +172,9 @@ def __init__( for i, up_block_type in enumerate(up_block_types): prev_output_channel = output_channel - output_channel = (reversed_block_out_channels[i + 1] - if i < len(up_block_types) - 1 else - final_upsample_channels) + output_channel = ( + reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels + ) is_final_block = i == len(block_out_channels) - 1 @@ -179,26 +184,28 @@ def __init__( in_channels=prev_output_channel, out_channels=output_channel, temb_channels=block_out_channels[0], - add_upsample=not is_final_block, ) + add_upsample=not is_final_block, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - num_groups_out = (norm_num_groups if norm_num_groups is not None else - min(block_out_channels[0] // 4, 32)) + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) self.out_block = get_out_block( out_block_type=out_block_type, num_groups_out=num_groups_out, embed_dim=block_out_channels[0], out_channels=out_channels, act_fn=act_fn, - fc_dim=block_out_channels[-1] // 4, ) + fc_dim=block_out_channels[-1] // 4, + ) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - return_dict: bool=True, ) -> Union[UNet1DOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + return_dict: bool = True, + ) -> Union[UNet1DOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): `(batch_size, num_channels, sample_size)` noisy inputs tensor @@ -223,16 +230,13 @@ def forward( timestep_embed = self.time_mlp(timestep_embed) else: timestep_embed = timestep_embed[..., None] - timestep_embed = timestep_embed.tile( - [1, 1, sample.shape[2]]).cast(sample.dtype) - timestep_embed = timestep_embed.broadcast_to( - (sample.shape[:1] + timestep_embed.shape[1:])) + timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype) + timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:])) # 2. down down_block_res_samples = () for downsample_block in self.down_blocks: - sample, res_samples = downsample_block( - hidden_states=sample, temb=timestep_embed) + sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed) down_block_res_samples += res_samples # 3. mid @@ -243,16 +247,13 @@ def forward( for i, upsample_block in enumerate(self.up_blocks): res_samples = down_block_res_samples[-1:] down_block_res_samples = down_block_res_samples[:-1] - sample = upsample_block( - sample, - res_hidden_states_tuple=res_samples, - temb=timestep_embed) + sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed) # 5. 
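When UNet1DModel is configured without a timestep MLP, the forward pass above simply tiles the projected timestep features along the signal length; a minimal sketch of that broadcast (hypothetical helper name):

```python
import paddle

def broadcast_time_embed(timestep_embed: paddle.Tensor, sample: paddle.Tensor) -> paddle.Tensor:
    # (batch, C) timestep features -> (batch, C, L) so they line up with a 1D signal.
    timestep_embed = timestep_embed[..., None]  # (batch, C, 1)
    timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype)
    # Broadcast over the batch axis as well, in case a single timestep was passed.
    return timestep_embed.broadcast_to(sample.shape[:1] + timestep_embed.shape[1:])
```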
post-process if self.out_block: sample = self.out_block(sample, timestep_embed) if not return_dict: - return (sample, ) + return (sample,) return UNet1DOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py index 7b3cf833bfba8..41a1810408693 100644 --- a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py +++ b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py @@ -20,24 +20,24 @@ from paddle import nn from ..utils import is_ppxformers_available -from .resnet import (Downsample1D, ResidualTemporalBlock1D, Upsample1D, - rearrange_dims) +from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims class DownResnetBlock1D(nn.Layer): def __init__( - self, - in_channels, - out_channels=None, - num_layers=1, - conv_shortcut=False, - temb_channels=32, - groups=32, - groups_out=None, - non_linearity=None, - time_embedding_norm="default", - output_scale_factor=1.0, - add_downsample=True, ): + self, + in_channels, + out_channels=None, + num_layers=1, + conv_shortcut=False, + temb_channels=32, + groups=32, + groups_out=None, + non_linearity=None, + time_embedding_norm="default", + output_scale_factor=1.0, + add_downsample=True, + ): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels @@ -51,15 +51,10 @@ def __init__( groups_out = groups # there will always be at least one resnet - resnets = [ - ResidualTemporalBlock1D( - in_channels, out_channels, embed_dim=temb_channels) - ] + resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)] for _ in range(num_layers): - resnets.append( - ResidualTemporalBlock1D( - out_channels, out_channels, embed_dim=temb_channels)) + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) self.resnets = nn.LayerList(resnets) @@ -74,8 +69,7 @@ def __init__( self.downsample = None if add_downsample: - self.downsample = Downsample1D( - out_channels, use_conv=True, padding=1) + self.downsample = Downsample1D(out_channels, use_conv=True, padding=1) def forward(self, hidden_states, temb=None): output_states = () @@ -84,7 +78,7 @@ def forward(self, hidden_states, temb=None): for resnet in self.resnets[1:]: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.nonlinearity is not None: hidden_states = self.nonlinearity(hidden_states) @@ -97,17 +91,18 @@ def forward(self, hidden_states, temb=None): class UpResnetBlock1D(nn.Layer): def __init__( - self, - in_channels, - out_channels=None, - num_layers=1, - temb_channels=32, - groups=32, - groups_out=None, - non_linearity=None, - time_embedding_norm="default", - output_scale_factor=1.0, - add_upsample=True, ): + self, + in_channels, + out_channels=None, + num_layers=1, + temb_channels=32, + groups=32, + groups_out=None, + non_linearity=None, + time_embedding_norm="default", + output_scale_factor=1.0, + add_upsample=True, + ): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels @@ -120,15 +115,10 @@ def __init__( groups_out = groups # there will always be at least one resnet - resnets = [ - ResidualTemporalBlock1D( - 2 * in_channels, out_channels, embed_dim=temb_channels) - ] + resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)] for _ in range(num_layers): - resnets.append( - ResidualTemporalBlock1D( - out_channels, 
out_channels, embed_dim=temb_channels)) + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) self.resnets = nn.LayerList(resnets) @@ -148,8 +138,7 @@ def __init__( def forward(self, hidden_states, res_hidden_states_tuple=None, temb=None): if res_hidden_states_tuple is not None: res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - (hidden_states, res_hidden_states), axis=1) + hidden_states = paddle.concat((hidden_states, res_hidden_states), axis=1) hidden_states = self.resnets[0](hidden_states, temb) for resnet in self.resnets[1:]: @@ -171,11 +160,9 @@ def __init__(self, in_channels, out_channels, embed_dim): self.out_channels = out_channels self.embed_dim = embed_dim - self.res1 = ResidualTemporalBlock1D( - in_channels, in_channels // 2, embed_dim=embed_dim) + self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim) self.down1 = Downsample1D(out_channels // 2, use_conv=True) - self.res2 = ResidualTemporalBlock1D( - in_channels // 2, in_channels // 4, embed_dim=embed_dim) + self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim) self.down2 = Downsample1D(out_channels // 4, use_conv=True) def forward(self, x, temb=None): @@ -188,29 +175,25 @@ def forward(self, x, temb=None): class MidResTemporalBlock1D(nn.Layer): def __init__( - self, - in_channels, - out_channels, - embed_dim, - num_layers: int=1, - add_downsample: bool=False, - add_upsample: bool=False, - non_linearity=None, ): + self, + in_channels, + out_channels, + embed_dim, + num_layers: int = 1, + add_downsample: bool = False, + add_upsample: bool = False, + non_linearity=None, + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.add_downsample = add_downsample # there will always be at least one resnet - resnets = [ - ResidualTemporalBlock1D( - in_channels, out_channels, embed_dim=embed_dim) - ] + resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)] for _ in range(num_layers): - resnets.append( - ResidualTemporalBlock1D( - out_channels, out_channels, embed_dim=embed_dim)) + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim)) self.resnets = nn.LayerList(resnets) @@ -271,11 +254,13 @@ def forward(self, hidden_states, temb=None): class OutValueFunctionBlock(nn.Layer): def __init__(self, fc_dim, embed_dim): super().__init__() - self.final_block = nn.LayerList([ - nn.Linear(fc_dim + embed_dim, fc_dim // 2), - nn.Mish(), - nn.Linear(fc_dim // 2, 1), - ]) + self.final_block = nn.LayerList( + [ + nn.Linear(fc_dim + embed_dim, fc_dim // 2), + nn.Mish(), + nn.Linear(fc_dim // 2, 1), + ] + ) def forward(self, hidden_states, temb): hidden_states = hidden_states.reshape([hidden_states.shape[0], -1]) @@ -324,15 +309,11 @@ def __init__(self, kernel="linear", pad_mode="reflect"): self.register_buffer("kernel", kernel_1d) def forward(self, hidden_states): - hidden_states = F.pad(hidden_states, (self.pad, ) * 2, - self.pad_mode, - data_format="NCL") + hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode, data_format="NCL") weight = paddle.zeros( - [ - hidden_states.shape[1], hidden_states.shape[1], - self.kernel.shape[0] - ], - dtype=hidden_states.dtype, ) + [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], + dtype=hidden_states.dtype, + ) indices = paddle.arange(hidden_states.shape[1]) weight[indices, indices] = self.kernel.cast(weight.dtype) return 
F.conv1d(hidden_states, weight, stride=2) @@ -347,19 +328,14 @@ def __init__(self, kernel="linear", pad_mode="reflect"): self.register_buffer("kernel", kernel_1d) def forward(self, hidden_states, temb=None): - hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2, ) * 2, - self.pad_mode, - data_format="NCL") + hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode, data_format="NCL") weight = paddle.zeros( - [ - hidden_states.shape[1], hidden_states.shape[1], - self.kernel.shape[0] - ], - dtype=hidden_states.dtype, ) + [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], + dtype=hidden_states.dtype, + ) indices = paddle.arange(hidden_states.shape[1]) weight[indices, indices] = self.kernel.cast(weight.dtype) - return F.conv1d_transpose( - hidden_states, weight, stride=2, padding=self.pad * 2 + 1) + return F.conv1d_transpose(hidden_states, weight, stride=2, padding=self.pad * 2 + 1) class SelfAttention1d(nn.Layer): @@ -395,9 +371,10 @@ def reshape_batch_dim_to_heads(self, tensor, transpose=True): return tensor def set_use_memory_efficient_attention_xformers( - self, - use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 # if self.head_size > 128 and attention_op == "flash": # attention_op = "cutlass" @@ -409,18 +386,15 @@ def set_use_memory_efficient_attention_xformers( else: try: _ = F.scaled_dot_product_attention_( - paddle.randn( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.randn( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.randn( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.randn((1, 1, 2, 40), dtype=paddle.float16), + paddle.randn((1, 1, 2, 40), dtype=paddle.float16), + paddle.randn((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e - self._use_memory_efficient_attention_xformers = ( - use_memory_efficient_attention_xformers) + self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers self._attention_op = attention_op def forward(self, hidden_states): @@ -434,14 +408,14 @@ def forward(self, hidden_states): value_proj = self.value(hidden_states) query_proj = self.reshape_heads_to_batch_dim( - query_proj, - transpose=not self._use_memory_efficient_attention_xformers) + query_proj, transpose=not self._use_memory_efficient_attention_xformers + ) key_proj = self.reshape_heads_to_batch_dim( - key_proj, - transpose=not self._use_memory_efficient_attention_xformers) + key_proj, transpose=not self._use_memory_efficient_attention_xformers + ) value_proj = self.reshape_heads_to_batch_dim( - value_proj, - transpose=not self._use_memory_efficient_attention_xformers) + value_proj, transpose=not self._use_memory_efficient_attention_xformers + ) if self._use_memory_efficient_attention_xformers: hidden_states = F.scaled_dot_product_attention_( @@ -452,19 +426,17 @@ def forward(self, hidden_states): scale=self.scale, dropout_p=0.0, training=self.training, - attention_op=self._attention_op, ) + attention_op=self._attention_op, + ) else: - attention_scores = (paddle.matmul( - query_proj, key_proj, transpose_y=True) * self.scale) - attention_probs = F.softmax( - attention_scores.cast("float32"), - axis=-1).cast(attention_scores.dtype) + attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale + attention_probs = 
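Downsample1d above implements FIR-style anti-aliased downsampling by placing one shared 1D kernel on the diagonal of a conv weight; the same pattern as a free function (a sketch reusing the Paddle calls from the hunk, with a hypothetical name):

```python
import paddle
import paddle.nn.functional as F

def fir_downsample_1d(hidden_states: paddle.Tensor, kernel_1d: paddle.Tensor,
                      pad: int, pad_mode: str = "reflect") -> paddle.Tensor:
    hidden_states = F.pad(hidden_states, (pad,) * 2, pad_mode, data_format="NCL")
    channels = hidden_states.shape[1]
    # One smoothing kernel per channel: zeros everywhere except the diagonal.
    weight = paddle.zeros([channels, channels, kernel_1d.shape[0]], dtype=hidden_states.dtype)
    indices = paddle.arange(channels)
    weight[indices, indices] = kernel_1d.cast(weight.dtype)
    # The stride-2 convolution performs the actual decimation.
    return F.conv1d(hidden_states, weight, stride=2)
```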
F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype) hidden_states = paddle.matmul(attention_probs, value_proj) # reshape hidden_states hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, - transpose=not self._use_memory_efficient_attention_xformers) + hidden_states, transpose=not self._use_memory_efficient_attention_xformers + ) # compute next hidden_states hidden_states = self.proj_attn(hidden_states) @@ -483,8 +455,7 @@ def __init__(self, in_channels, mid_channels, out_channels, is_last=False): self.has_conv_skip = in_channels != out_channels if self.has_conv_skip: - self.conv_skip = nn.Conv1D( - in_channels, out_channels, 1, bias_attr=False) + self.conv_skip = nn.Conv1D(in_channels, out_channels, 1, bias_attr=False) self.conv_1 = nn.Conv1D(in_channels, mid_channels, 5, padding=2) self.group_norm_1 = nn.GroupNorm(1, mid_channels) @@ -496,8 +467,7 @@ def __init__(self, in_channels, mid_channels, out_channels, is_last=False): self.gelu_2 = nn.GELU() def forward(self, hidden_states): - residual = (self.conv_skip(hidden_states) - if self.has_conv_skip else hidden_states) + residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states hidden_states = self.conv_1(hidden_states) hidden_states = self.group_norm_1(hidden_states) @@ -579,7 +549,7 @@ def forward(self, hidden_states, temb=None): hidden_states = resnet(hidden_states) hidden_states = attn(hidden_states) - return hidden_states, (hidden_states, ) + return hidden_states, (hidden_states,) class DownBlock1D(nn.Layer): @@ -602,7 +572,7 @@ def forward(self, hidden_states, temb=None): for resnet in self.resnets: hidden_states = resnet(hidden_states) - return hidden_states, (hidden_states, ) + return hidden_states, (hidden_states,) class DownBlock1DNoSkip(nn.Layer): @@ -623,7 +593,7 @@ def forward(self, hidden_states, temb=None): for resnet in self.resnets: hidden_states = resnet(hidden_states) - return hidden_states, (hidden_states, ) + return hidden_states, (hidden_states,) class AttnUpBlock1D(nn.Layer): @@ -648,8 +618,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None): def forward(self, hidden_states, res_hidden_states_tuple, temb=None): res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states) @@ -676,8 +645,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None): def forward(self, hidden_states, res_hidden_states_tuple, temb=None): res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) for resnet in self.resnets: hidden_states = resnet(hidden_states) @@ -695,16 +663,14 @@ def __init__(self, in_channels, out_channels, mid_channels=None): resnets = [ ResConvBlock(2 * in_channels, mid_channels, mid_channels), ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock( - mid_channels, mid_channels, out_channels, is_last=True), + ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True), ] self.resnets = nn.LayerList(resnets) def forward(self, hidden_states, res_hidden_states_tuple, temb=None): res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = 
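The else-branch of SelfAttention1d.forward above is the plain matmul attention fallback used when the memory-efficient kernel is disabled; isolated, it amounts to the following (hypothetical function name, same softmax-in-fp32 trick):

```python
import paddle
import paddle.nn.functional as F

def naive_attention(query_proj: paddle.Tensor, key_proj: paddle.Tensor,
                    value_proj: paddle.Tensor, scale: float) -> paddle.Tensor:
    attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * scale
    # Softmax in float32 for numerical stability, then cast back to the score dtype.
    attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype)
    return paddle.matmul(attention_probs, value_proj)
```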
paddle.concat([hidden_states, res_hidden_states], axis=1) for resnet in self.resnets: hidden_states = resnet(hidden_states) @@ -713,79 +679,77 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, ): + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, +): if down_block_type == "DownResnetBlock1D": return DownResnetBlock1D( in_channels=in_channels, num_layers=num_layers, out_channels=out_channels, temb_channels=temb_channels, - add_downsample=add_downsample, ) + add_downsample=add_downsample, + ) elif down_block_type == "DownBlock1D": return DownBlock1D(out_channels=out_channels, in_channels=in_channels) elif down_block_type == "AttnDownBlock1D": - return AttnDownBlock1D( - out_channels=out_channels, in_channels=in_channels) + return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels) elif down_block_type == "DownBlock1DNoSkip": - return DownBlock1DNoSkip( - out_channels=out_channels, in_channels=in_channels) + return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels) raise ValueError(f"{down_block_type} does not exist.") -def get_up_block(up_block_type, num_layers, in_channels, out_channels, - temb_channels, add_upsample): +def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample): if up_block_type == "UpResnetBlock1D": return UpResnetBlock1D( in_channels=in_channels, num_layers=num_layers, out_channels=out_channels, temb_channels=temb_channels, - add_upsample=add_upsample, ) + add_upsample=add_upsample, + ) elif up_block_type == "UpBlock1D": return UpBlock1D(in_channels=in_channels, out_channels=out_channels) elif up_block_type == "AttnUpBlock1D": return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels) elif up_block_type == "UpBlock1DNoSkip": - return UpBlock1DNoSkip( - in_channels=in_channels, out_channels=out_channels) + return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels) raise ValueError(f"{up_block_type} does not exist.") def get_mid_block( - mid_block_type, - num_layers, - in_channels, - mid_channels, - out_channels, - embed_dim, - add_downsample, ): + mid_block_type, + num_layers, + in_channels, + mid_channels, + out_channels, + embed_dim, + add_downsample, +): if mid_block_type == "MidResTemporalBlock1D": return MidResTemporalBlock1D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim, - add_downsample=add_downsample, ) + add_downsample=add_downsample, + ) elif mid_block_type == "ValueFunctionMidBlock1D": - return ValueFunctionMidBlock1D( - in_channels=in_channels, - out_channels=out_channels, - embed_dim=embed_dim) + return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim) elif mid_block_type == "UNetMidBlock1D": return UNetMidBlock1D( in_channels=in_channels, mid_channels=mid_channels, - out_channels=out_channels, ) + out_channels=out_channels, + ) raise ValueError(f"{mid_block_type} does not exist.") -def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, - act_fn, fc_dim): +def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim): if out_block_type == "OutConv1DBlock": return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn) elif out_block_type == "ValueFunction": diff --git a/ppdiffusers/ppdiffusers/models/unet_2d.py 
b/ppdiffusers/ppdiffusers/models/unet_2d.py index c3bcf99332789..f66b21a6a9e50 100644 --- a/ppdiffusers/ppdiffusers/models/unet_2d.py +++ b/ppdiffusers/ppdiffusers/models/unet_2d.py @@ -83,37 +83,40 @@ class conditioning with `class_embed_type` equal to `None`. @register_to_config def __init__( - self, - sample_size: Optional[Union[int, Tuple[int, int]]]=None, - in_channels: int=3, - out_channels: int=3, - center_input_sample: bool=False, - time_embedding_type: str="positional", - freq_shift: int=0, - flip_sin_to_cos: bool=True, - down_block_types: Tuple[str]=( - "DownBlock2D", - "AttnDownBlock2D", - "AttnDownBlock2D", - "AttnDownBlock2D", ), - up_block_types: Tuple[str]=( - "AttnUpBlock2D", - "AttnUpBlock2D", - "AttnUpBlock2D", - "UpBlock2D", ), - block_out_channels: Tuple[int]=(224, 448, 672, 896), - layers_per_block: int=2, - mid_block_scale_factor: float=1, - downsample_padding: int=1, - act_fn: str="silu", - attention_head_dim: Optional[int]=8, - norm_num_groups: int=32, - norm_eps: float=1e-5, - resnet_time_scale_shift: str="default", - add_attention: bool=True, - class_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - resnet_pre_temb_non_linearity: Optional[bool]=False, ): + self, + sample_size: Optional[Union[int, Tuple[int, int]]] = None, + in_channels: int = 3, + out_channels: int = 3, + center_input_sample: bool = False, + time_embedding_type: str = "positional", + freq_shift: int = 0, + flip_sin_to_cos: bool = True, + down_block_types: Tuple[str] = ( + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ), + up_block_types: Tuple[str] = ( + "AttnUpBlock2D", + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + ), + block_out_channels: Tuple[int] = (224, 448, 672, 896), + layers_per_block: int = 2, + mid_block_scale_factor: float = 1, + downsample_padding: int = 1, + act_fn: str = "silu", + attention_head_dim: Optional[int] = 8, + norm_num_groups: int = 32, + norm_eps: float = 1e-5, + resnet_time_scale_shift: str = "default", + add_attention: bool = True, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + resnet_pre_temb_non_linearity: Optional[bool] = False, + ): super().__init__() self.sample_size = sample_size @@ -131,29 +134,23 @@ def __init__( ) # input - self.conv_in = nn.Conv2D( - in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) + self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) # time if time_embedding_type == "fourier": - self.time_proj = GaussianFourierProjection( - embedding_size=block_out_channels[0], scale=16) + self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) timestep_input_dim = 2 * block_out_channels[0] elif time_embedding_type == "positional": - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] - self.time_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.class_embedding = 
TimestepEmbedding(timestep_input_dim, time_embed_dim) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) else: @@ -195,7 +192,8 @@ def __init__( attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) # mid @@ -209,7 +207,8 @@ def __init__( attn_num_head_channels=attention_head_dim, resnet_groups=norm_num_groups, add_attention=add_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -217,8 +216,7 @@ def __init__( for i, up_block_type in enumerate(up_block_types): prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] is_final_block = i == len(block_out_channels) - 1 @@ -235,27 +233,28 @@ def __init__( resnet_groups=norm_num_groups, attn_num_head_channels=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - num_groups_out = (norm_num_groups if norm_num_groups is not None else - min(block_out_channels[0] // 4, 32)) + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=num_groups_out, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) self.conv_act = nn.Silu() - self.conv_out = nn.Conv2D( - block_out_channels[0], out_channels, kernel_size=3, padding=1) + self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, kernel_size=3, padding=1) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - class_labels: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet2DOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + class_labels: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor @@ -284,7 +283,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) @@ -296,9 +299,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when doing class conditioning" - ) + raise ValueError("class_labels should be provided when doing class conditioning") class_labels = class_labels.cast(self.dtype) @@ -315,7 +316,7 @@ def forward( sample = self.conv_in(sample) # 3. 
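UNet2DModel.forward above first normalizes the timestep into a batch-sized tensor before projecting and embedding it; roughly as follows (a sketch of the visible steps only, assuming `unet` exposes `time_proj` and `time_embedding` as in the diff; the helper name is illustrative):

```python
import paddle

def prepare_time_embedding(unet, sample: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor:
    if len(timesteps.shape) == 0:
        timesteps = timesteps[None]
    # Broadcast to the batch dimension in a way that stays ONNX/Core ML compatible.
    timesteps = timesteps.expand([sample.shape[0]])
    t_emb = unet.time_proj(timesteps)   # sinusoidal or Fourier features
    return unet.time_embedding(t_emb)   # small MLP producing the per-block embedding
```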
down - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) if self.resnet_pre_temb_non_linearity: emb = self.down_resnet_temb_nonlinearity(emb) @@ -323,10 +324,10 @@ def forward( for downsample_block in self.down_blocks: if hasattr(downsample_block, "skip_conv"): sample, res_samples, skip_sample = downsample_block( - hidden_states=sample, temb=emb, skip_sample=skip_sample) + hidden_states=sample, temb=emb, skip_sample=skip_sample + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples @@ -336,13 +337,11 @@ def forward( # 5. up skip_sample = None for upsample_block in self.up_blocks: - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] if hasattr(upsample_block, "skip_conv"): - sample, skip_sample = upsample_block(sample, res_samples, emb, - skip_sample) + sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample) else: sample = upsample_block(sample, res_samples, emb) @@ -355,11 +354,10 @@ def forward( sample += skip_sample if self.config.time_embedding_type == "fourier": - timesteps = timesteps.reshape( - [sample.shape[0], *([1] * len(sample.shape[1:]))]) + timesteps = timesteps.reshape([sample.shape[0], *([1] * len(sample.shape[1:]))]) sample = sample / timesteps if not return_dict: - return (sample, ) + return (sample,) return UNet2DOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py index b49e5263c2077..5bfa7a33dcbff 100644 --- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py +++ b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py @@ -22,36 +22,42 @@ from .attention import AdaGroupNorm, AttentionBlock from .attention_processor import Attention, AttnAddedKVProcessor from .dual_transformer_2d import DualTransformer2DModel -from .resnet import (Downsample2D, FirDownsample2D, FirUpsample2D, - KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D) +from .resnet import ( + Downsample2D, + FirDownsample2D, + FirUpsample2D, + KDownsample2D, + KUpsample2D, + ResnetBlock2D, + Upsample2D, +) from .transformer_2d import Transformer2DModel def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, - resnet_out_scale_factor=1.0, - cross_attention_norm=None, - resnet_pre_temb_non_linearity=False, ): - down_block_type = (down_block_type[7:] - if down_block_type.startswith("UNetRes") else - down_block_type) + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + 
resnet_out_scale_factor=1.0, + cross_attention_norm=None, + resnet_pre_temb_non_linearity=False, +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type if down_block_type == "DownBlock2D": return DownBlock2D( num_layers=num_layers, @@ -64,7 +70,8 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "ResnetDownsampleBlock2D": return ResnetDownsampleBlock2D( num_layers=num_layers, @@ -78,7 +85,8 @@ def get_down_block( resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, output_scale_factor=resnet_out_scale_factor, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "AttnDownBlock2D": return AttnDownBlock2D( num_layers=num_layers, @@ -92,11 +100,11 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "CrossAttnDownBlock2D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnDownBlock2D") + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") return CrossAttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -114,12 +122,11 @@ def get_down_block( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "SimpleCrossAttnDownBlock2D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D" - ) + raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D") return SimpleCrossAttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -136,7 +143,8 @@ def get_down_block( output_scale_factor=resnet_out_scale_factor, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "SkipDownBlock2D": return SkipDownBlock2D( num_layers=num_layers, @@ -148,7 +156,8 @@ def get_down_block( resnet_act_fn=resnet_act_fn, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "AttnSkipDownBlock2D": return AttnSkipDownBlock2D( num_layers=num_layers, @@ -161,7 +170,8 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "DownEncoderBlock2D": return DownEncoderBlock2D( num_layers=num_layers, @@ -173,7 
+183,8 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "AttnDownEncoderBlock2D": return AttnDownEncoderBlock2D( num_layers=num_layers, @@ -186,7 +197,8 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "KDownBlock2D": return KDownBlock2D( num_layers=num_layers, @@ -196,7 +208,8 @@ def get_down_block( add_downsample=add_downsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "KCrossAttnDownBlock2D": return KCrossAttnDownBlock2D( num_layers=num_layers, @@ -209,34 +222,35 @@ def get_down_block( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attn_num_head_channels, add_self_attention=True if not add_downsample else False, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{down_block_type} does not exist.") def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, - resnet_out_scale_factor=1.0, - cross_attention_norm=None, - resnet_pre_temb_non_linearity=False, ): - up_block_type = (up_block_type[7:] - if up_block_type.startswith("UNetRes") else up_block_type) + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + resnet_out_scale_factor=1.0, + cross_attention_norm=None, + resnet_pre_temb_non_linearity=False, +): + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type if up_block_type == "UpBlock2D": return UpBlock2D( num_layers=num_layers, @@ -249,7 +263,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "ResnetUpsampleBlock2D": return ResnetUpsampleBlock2D( num_layers=num_layers, @@ -264,11 +279,11 @@ def get_up_block( resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, output_scale_factor=resnet_out_scale_factor, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "CrossAttnUpBlock2D": if 
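Since the factory signatures above are long, a hypothetical usage example may help; the argument values below are illustrative only and not taken from any shipped UNet config:

```python
# Build one cross-attention down block through the factory reformatted above.
down_block = get_down_block(
    "CrossAttnDownBlock2D",
    num_layers=2,
    in_channels=320,
    out_channels=640,
    temb_channels=1280,
    add_downsample=True,
    resnet_eps=1e-5,
    resnet_act_fn="silu",
    attn_num_head_channels=8,
    cross_attention_dim=768,
)
```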
cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnUpBlock2D") + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") return CrossAttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -286,12 +301,11 @@ def get_up_block( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "SimpleCrossAttnUpBlock2D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D" - ) + raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D") return SimpleCrossAttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -309,7 +323,8 @@ def get_up_block( output_scale_factor=resnet_out_scale_factor, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "AttnUpBlock2D": return AttnUpBlock2D( num_layers=num_layers, @@ -323,7 +338,8 @@ def get_up_block( resnet_groups=resnet_groups, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "SkipUpBlock2D": return SkipUpBlock2D( num_layers=num_layers, @@ -335,7 +351,8 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "AttnSkipUpBlock2D": return AttnSkipUpBlock2D( num_layers=num_layers, @@ -348,7 +365,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "UpDecoderBlock2D": return UpDecoderBlock2D( num_layers=num_layers, @@ -359,7 +377,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "AttnUpDecoderBlock2D": return AttnUpDecoderBlock2D( num_layers=num_layers, @@ -371,7 +390,8 @@ def get_up_block( resnet_groups=resnet_groups, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "KUpBlock2D": return KUpBlock2D( num_layers=num_layers, @@ -381,7 +401,8 @@ def get_up_block( add_upsample=add_upsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "KCrossAttnUpBlock2D": return KCrossAttnUpBlock2D( num_layers=num_layers, @@ -393,30 +414,31 @@ def 
get_up_block( resnet_act_fn=resnet_act_fn, cross_attention_dim=cross_attention_dim, attn_num_head_channels=attn_num_head_channels, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{up_block_type} does not exist.") class UNetMidBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - add_attention: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + add_attention: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.add_attention = add_attention # there is always at least one resnet @@ -432,7 +454,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -444,7 +467,9 @@ def __init__( num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) else: attentions.append(None) @@ -460,7 +485,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) @@ -477,29 +504,29 @@ def forward(self, hidden_states, temb=None): class UNetMidBlock2DCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups 
= (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet resnets = [ @@ -514,7 +541,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -529,7 +557,9 @@ def __init__( cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -538,7 +568,9 @@ def __init__( in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) resnets.append( ResnetBlock2D( in_channels=in_channels, @@ -551,24 +583,28 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = resnet(hidden_states, temb) return hidden_states @@ -576,30 +612,30 @@ def forward( class UNetMidBlock2DSimpleCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.num_heads = in_channels // self.attn_num_head_channels @@ -617,7 +653,8 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -639,7 +676,9 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) resnets.append( ResnetBlock2D( in_channels=in_channels, @@ -653,20 +692,22 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): # attn @@ -674,7 +715,8 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) # resnet hidden_states = resnet(hidden_states, temb) @@ -684,22 +726,23 @@ def forward( class AttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - downsample_padding: int=1, - add_downsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -718,27 +761,34 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -748,40 +798,41 @@ def 
forward(self, hidden_states, temb=None): for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class CrossAttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - downsample_padding: int=1, - add_downsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -803,7 +854,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -815,7 +868,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -824,99 +879,103 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, - additional_residuals=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + additional_residuals=None, + ): # TODO(Patrick, William) - attention mask is not used output_states = () for resnet, 
attn in zip(self.resnets, self.attentions): - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: - return module( - *inputs, return_dict=return_dict)[ - 0] # move [0] when paddlepaddle <= 2.4.1 + return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1 else: return module(*inputs) return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample - output_states += (hidden_states, ) + output_states += (hidden_states,) if additional_residuals is not None: hidden_states += additional_residuals # westfish: add to align with torch features - output_states = tuple(output_states[:-1]) + (hidden_states, ) + output_states = tuple(output_states[:-1]) + (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class DownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -934,19 +993,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -956,8 +1020,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not 
hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -965,38 +1028,38 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class DownEncoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1014,19 +1077,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -1043,21 +1111,22 @@ def forward(self, hidden_states): class AttnDownEncoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1076,27 +1145,34 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -1114,21 +1190,22 @@ def forward(self, hidden_states): class AttnSkipDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=np.sqrt(2.0), - downsample_padding: int=1, - add_downsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = np.sqrt(2.0), + downsample_padding: int = 1, + add_downsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.attentions = nn.LayerList([]) self.resnets = nn.LayerList([]) @@ -1148,13 +1225,17 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, - eps=resnet_eps, )) + eps=resnet_eps, + ) + ) if add_downsample: self.resnet_down = ResnetBlock2D( @@ -1171,12 +1252,10 @@ def __init__( use_in_shortcut=True, down=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.downsamplers = nn.LayerList( - [FirDownsample2D( - out_channels, out_channels=out_channels)]) - self.skip_conv = nn.Conv2D( - 3, out_channels, kernel_size=(1, 1), stride=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)]) + self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) else: self.resnet_down = None self.downsamplers = None @@ -1188,7 +1267,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None): for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: hidden_states = self.resnet_down(hidden_states, temb) @@ -1197,27 +1276,28 @@ def forward(self, hidden_states, temb=None, skip_sample=None): hidden_states = self.skip_conv(skip_sample) + hidden_states - output_states += (hidden_states, ) + output_states += (hidden_states,) return 
hidden_states, output_states, skip_sample class SkipDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - output_scale_factor: float=np.sqrt(2.0), - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + output_scale_factor: float = np.sqrt(2.0), + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.resnets = nn.LayerList([]) @@ -1236,7 +1316,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if add_downsample: self.resnet_down = ResnetBlock2D( @@ -1253,12 +1335,10 @@ def __init__( use_in_shortcut=True, down=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.downsamplers = nn.LayerList( - [FirDownsample2D( - out_channels, out_channels=out_channels)]) - self.skip_conv = nn.Conv2D( - 3, out_channels, kernel_size=(1, 1), stride=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)]) + self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) else: self.resnet_down = None self.downsamplers = None @@ -1269,7 +1349,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None): for resnet in self.resnets: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: hidden_states = self.resnet_down(hidden_states, temb) @@ -1278,28 +1358,29 @@ def forward(self, hidden_states, temb=None, skip_sample=None): hidden_states = self.skip_conv(skip_sample) + hidden_states - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states, skip_sample class ResnetDownsampleBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - skip_time_act: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + skip_time_act: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1318,27 +1399,32 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - down=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.downsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + down=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.downsamplers = None @@ -1348,8 +1434,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1357,43 +1442,43 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class SimpleCrossAttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_downsample: bool=True, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True @@ -1419,7 +1504,9 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) # TODO use 
AttnAddedKVProcessor2_5 # processor = ( # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() @@ -1437,42 +1524,47 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - down=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.downsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + down=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): output_states = () - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} for resnet, attn in zip(self.resnets, self.attentions): # resnet @@ -1483,32 +1575,34 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class KDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=4, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_group_size: int=32, - add_downsample: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: int = 32, + add_downsample: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1529,7 +1623,9 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) @@ -1545,8 +1641,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and 
self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1554,12 +1649,11 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: @@ -1570,20 +1664,21 @@ def custom_forward(*inputs): class KCrossAttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - cross_attention_dim: int, - dropout: float=0.0, - num_layers: int=4, - resnet_group_size: int=32, - add_downsample=True, - attn_num_head_channels: int=64, - add_self_attention: bool=False, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + cross_attention_dim: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_group_size: int = 32, + add_downsample=True, + attn_num_head_channels: int = 64, + add_self_attention: bool = False, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1607,7 +1702,9 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( KAttentionBlock( out_channels, @@ -1618,7 +1715,9 @@ def __init__( attention_bias=True, add_self_attention=add_self_attention, cross_attention_norm="layer_norm", - group_size=resnet_group_size, )) + group_size=resnet_group_size, + ) + ) self.resnets = nn.LayerList(resnets) self.attentions = nn.LayerList(attentions) @@ -1631,17 +1730,17 @@ def __init__( self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): output_states = () for resnet, attn in zip(self.resnets, self.attentions): - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): @@ -1652,15 +1751,14 @@ def custom_forward(*inputs): return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, attention_mask, - cross_attention_kwargs, ) + cross_attention_kwargs, + ) else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( @@ -1668,12 +1766,13 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, emb=temb, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, 
) + cross_attention_kwargs=cross_attention_kwargs, + ) if self.downsamplers is None: - output_states += (None, ) + output_states += (None,) else: - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: @@ -1684,29 +1783,29 @@ def custom_forward(*inputs): class AttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1721,23 +1820,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None @@ -1746,8 +1846,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) @@ -1761,27 +1860,28 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): class CrossAttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_upsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: 
bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1790,8 +1890,7 @@ def __init__( self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1806,7 +1905,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -1818,7 +1919,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1827,64 +1930,61 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): # TODO(Patrick, William) - attention mask is not used for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: - return module( - *inputs, return_dict=return_dict)[0] # move [0] + return module(*inputs, return_dict=return_dict)[0] # move [0] else: return module(*inputs) return 
custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -1895,27 +1995,27 @@ def custom_forward(*inputs): class UpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1930,34 +2030,27 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1965,8 +2058,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = 
resnet(hidden_states, temb) @@ -1979,19 +2071,20 @@ def custom_forward(*inputs): class UpDecoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -2010,15 +2103,14 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None @@ -2035,20 +2127,21 @@ def forward(self, hidden_states): class AttnUpDecoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -2068,23 +2161,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None @@ -2102,29 +2196,29 @@ def forward(self, hidden_states): class AttnSkipUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - 
resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=np.sqrt(2.0), - upsample_padding: int=1, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = np.sqrt(2.0), + upsample_padding: int = 1, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.attentions = nn.LayerList([]) self.resnets = nn.LayerList([]) for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels self.resnets.append( @@ -2140,14 +2234,18 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, - eps=resnet_eps, )) + eps=resnet_eps, + ) + ) self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) if add_upsample: @@ -2166,17 +2264,14 @@ def __init__( use_in_shortcut=True, up=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.skip_conv = nn.Conv2D( - out_channels, - 3, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) self.skip_norm = nn.GroupNorm( num_groups=min(out_channels // 4, 32), num_channels=out_channels, - epsilon=resnet_eps, ) + epsilon=resnet_eps, + ) self.act = nn.Silu() else: self.resnet_up = None @@ -2184,17 +2279,12 @@ def __init__( self.skip_norm = None self.act = None - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - skip_sample=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) @@ -2219,27 +2309,27 @@ def forward(self, class SkipUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - output_scale_factor: float=np.sqrt(2.0), - add_upsample: bool=True, - upsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + 
resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + output_scale_factor: float = np.sqrt(2.0), + add_upsample: bool = True, + upsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.resnets = nn.LayerList([]) for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels self.resnets.append( @@ -2248,15 +2338,16 @@ def __init__( out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, - groups=min((resnet_in_channels + res_skip_channels) // 4, - 32), + groups=min((resnet_in_channels + res_skip_channels) // 4, 32), groups_out=min(out_channels // 4, 32), dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) if add_upsample: @@ -2275,17 +2366,14 @@ def __init__( use_in_shortcut=True, up=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.skip_conv = nn.Conv2D( - out_channels, - 3, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) self.skip_norm = nn.GroupNorm( num_groups=min(out_channels // 4, 32), num_channels=out_channels, - epsilon=resnet_eps, ) + epsilon=resnet_eps, + ) self.act = nn.Silu() else: self.resnet_up = None @@ -2293,17 +2381,12 @@ def __init__( self.skip_norm = None self.act = None - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - skip_sample=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) @@ -2326,28 +2409,28 @@ def forward(self, class ResnetUpsampleBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - skip_time_act=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + skip_time_act=False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = 
in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -2363,46 +2446,45 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - up=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.upsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + up=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.upsamplers = None self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -2410,8 +2492,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) @@ -2424,26 +2505,27 @@ def custom_forward(*inputs): class SimpleCrossAttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_upsample: bool=True, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + 
add_upsample: bool = True, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -2454,8 +2536,7 @@ def __init__( self.num_heads = out_channels // self.attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -2471,7 +2552,9 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) # TODO support AttnAddedKVProcessor2_5 # processor = ( # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() @@ -2489,50 +2572,54 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - up=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.upsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + up=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + upsample_size=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} for resnet, attn in zip(self.resnets, self.attentions): # resnet # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) @@ -2541,7 +2628,8 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2552,17 +2640,18 @@ def forward( class KUpBlock2D(nn.Layer): def __init__( 
- self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=5, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_group_size: Optional[int]=32, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 5, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: Optional[int] = 32, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] k_in_channels = 2 * out_channels @@ -2577,8 +2666,7 @@ def __init__( resnets.append( ResnetBlock2D( in_channels=in_channels, - out_channels=k_out_channels - if (i == num_layers - 1) else out_channels, + out_channels=k_out_channels if (i == num_layers - 1) else out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=groups, @@ -2587,7 +2675,9 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_norm=resnet_pre_temb_non_linearity, )) + pre_norm=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) @@ -2598,19 +2688,13 @@ def __init__( self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: - hidden_states = paddle.concat( - [hidden_states, res_hidden_states_tuple], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1) for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -2618,8 +2702,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) @@ -2632,20 +2715,21 @@ def custom_forward(*inputs): class KCrossAttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=4, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_group_size: int=32, - attn_num_head_channels=1, # attention dim_head - cross_attention_dim: int=768, - add_upsample: bool=True, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: int = 32, + attn_num_head_channels=1, # attention dim_head + cross_attention_dim: int = 768, + add_upsample: bool = True, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -2686,20 +2770,24 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( KAttentionBlock( k_out_channels if (i == num_layers 
- 1) else out_channels, k_out_channels // attn_num_head_channels - if (i == num_layers - 1) else out_channels // - attn_num_head_channels, + if (i == num_layers - 1) + else out_channels // attn_num_head_channels, attn_num_head_channels, cross_attention_dim=cross_attention_dim, temb_channels=temb_channels, attention_bias=True, add_self_attention=add_self_attention, cross_attention_norm="layer_norm", - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) self.resnets = nn.LayerList(resnets) self.attentions = nn.LayerList(attentions) @@ -2712,42 +2800,39 @@ def __init__( self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: - hidden_states = paddle.concat( - [hidden_states, res_hidden_states_tuple], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1) for resnet, attn in zip(self.resnets, self.attentions): - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: - return module( - *inputs, return_dict=return_dict)[0] # move [0] + return module(*inputs, return_dict=return_dict)[0] # move [0] else: return module(*inputs) return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, attention_mask, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( @@ -2755,7 +2840,8 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, emb=temb, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2783,25 +2869,25 @@ class KAttentionBlock(nn.Layer): """ def __init__( - self, - dim: int, - num_attention_heads: int, - attention_head_dim: int, - dropout: float=0.0, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - upcast_attention: bool=False, - temb_channels: int=768, # for ada_group_norm - add_self_attention: bool=False, - cross_attention_norm: Optional[str]=None, - group_size: int=32, ): + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout: float = 0.0, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + upcast_attention: bool = False, + temb_channels: int = 768, # for ada_group_norm + add_self_attention: bool = False, + cross_attention_norm: Optional[str] = None, + group_size: int = 32, + ): super().__init__() self.add_self_attention = add_self_attention # 1. 
Self-Attn if add_self_attention: - self.norm1 = AdaGroupNorm(temb_channels, dim, - max(1, dim // group_size)) + self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) self.attn1 = Attention( query_dim=dim, heads=num_attention_heads, @@ -2809,7 +2895,8 @@ def __init__( dropout=dropout, bias=attention_bias, cross_attention_dim=None, - cross_attention_norm=None, ) + cross_attention_norm=None, + ) # 2. Cross-Attn self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) @@ -2821,25 +2908,24 @@ def __init__( dropout=dropout, bias=attention_bias, upcast_attention=upcast_attention, - cross_attention_norm=cross_attention_norm, ) + cross_attention_norm=cross_attention_norm, + ) def _to_3d(self, hidden_states, height, weight): - return hidden_states.transpose([0, 2, 3, 1]).reshape( - [hidden_states.shape[0], height * weight, -1]) + return hidden_states.transpose([0, 2, 3, 1]).reshape([hidden_states.shape[0], height * weight, -1]) def _to_4d(self, hidden_states, height, weight): - return hidden_states.transpose([0, 2, 1]).reshape( - [hidden_states.shape[0], -1, height, weight]) + return hidden_states.transpose([0, 2, 1]).reshape([hidden_states.shape[0], -1, height, weight]) def forward( - self, - hidden_states, - encoder_hidden_states=None, - emb=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + encoder_hidden_states=None, + emb=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} # 1. Self-Attention if self.add_self_attention: @@ -2851,7 +2937,8 @@ def forward( attn_output = self.attn1( norm_hidden_states, encoder_hidden_states=None, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) attn_output = self._to_4d(attn_output, height, weight) hidden_states = attn_output + hidden_states @@ -2864,7 +2951,8 @@ def forward( attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) attn_output = self._to_4d(attn_output, height, weight) hidden_states = attn_output + hidden_states diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py index 173a1185da9a8..606a6f0b91ba5 100644 --- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py +++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py @@ -23,13 +23,23 @@ from ..loaders import UNet2DConditionLoadersMixin from ..utils import NEG_INF, BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import (GaussianFourierProjection, TextTimeEmbedding, - TimestepEmbedding, Timesteps) +from .embeddings import ( + GaussianFourierProjection, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) from .modeling_utils import ModelMixin -from .unet_2d_blocks import (CrossAttnDownBlock2D, CrossAttnUpBlock2D, - DownBlock2D, UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, UpBlock2D, - get_down_block, get_up_block) +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + get_down_block, + get_up_block, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -45,8 +55,7 @@ class UNet2DConditionOutput(BaseOutput): sample: paddle.Tensor -class 
UNet2DConditionModel(ModelMixin, ConfigMixin, - UNet2DConditionLoadersMixin): +class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep and returns sample shaped output. @@ -126,57 +135,60 @@ class conditioning with `class_embed_type` equal to `None`. @register_to_config def __init__( - self, - sample_size: Optional[int]=None, - in_channels: int=4, - out_channels: int=4, - center_input_sample: bool=False, - flip_sin_to_cos: bool=True, - freq_shift: int=0, - down_block_types: Tuple[str]=( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", ), - mid_block_type: Optional[str]="UNetMidBlock2DCrossAttn", - up_block_types: Tuple[str]=( - "UpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", ), - only_cross_attention: Union[bool, Tuple[bool]]=False, - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]]=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-5, - cross_attention_dim: Union[int, Tuple[int]]=1280, - encoder_hid_dim: Optional[int]=None, - attention_head_dim: Union[int, Tuple[int]]=8, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - class_embed_type: Optional[str]=None, - addition_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - upcast_attention: bool=False, - resnet_time_scale_shift: str="default", - resnet_skip_time_act: bool=False, - resnet_out_scale_factor: int=1.0, - time_embedding_type: str="positional", # fourier, positional - time_embedding_dim: Optional[int]=None, - time_embedding_act_fn: Optional[str]=None, - timestep_post_act: Optional[str]=None, - time_cond_proj_dim: Optional[int]=None, - conv_in_kernel: int=3, - conv_out_kernel: int=3, - projection_class_embeddings_input_dim: Optional[int]=None, - class_embeddings_concat: bool=False, - mid_block_only_cross_attention: Optional[bool]=None, - cross_attention_norm: Optional[str]=None, - resnet_pre_temb_non_linearity: Optional[bool]=False, - addition_embed_type_num_heads: int=64, ): + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + encoder_hid_dim: Optional[int] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + 
resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", # fourier, positional + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + resnet_pre_temb_non_linearity: Optional[bool] = False, + addition_embed_type_num_heads: int = 64, + ): super().__init__() self.sample_size = sample_size @@ -192,30 +204,22 @@ def __init__( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - only_cross_attention, - bool) and len(only_cross_attention) != len(down_block_types): + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." ) - if isinstance( - cross_attention_dim, - list) and len(cross_attention_dim) != len(down_block_types): + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." ) - if not isinstance( - layers_per_block, - int) and len(layers_per_block) != len(down_block_types): + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): raise ValueError( f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." ) @@ -226,26 +230,25 @@ def __init__( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time if time_embedding_type == "fourier": time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 if time_embed_dim % 2 != 0: - raise ValueError( - f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." 
- ) + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( time_embed_dim // 2, set_W_to_weight=False, log=False, - flip_sin_to_cos=flip_sin_to_cos, ) + flip_sin_to_cos=flip_sin_to_cos, + ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] else: raise ValueError( @@ -257,21 +260,19 @@ def __init__( time_embed_dim, act_fn=act_fn, post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, ) + cond_proj_dim=time_cond_proj_dim, + ) if encoder_hid_dim is not None: - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, - cross_attention_dim) + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) else: self.encoder_hid_proj = None # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) # int64 + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) # int64 elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding( - timestep_input_dim, time_embed_dim, act_fn=act_fn) # float + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) # float elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -286,15 +287,13 @@ def __init__( # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. - self.class_embedding = TimestepEmbedding( - projection_class_embeddings_input_dim, time_embed_dim) # float + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) # float elif class_embed_type == "simple_projection": if projection_class_embeddings_input_dim is None: raise ValueError( "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" ) - self.class_embedding = nn.Linear( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None @@ -307,11 +306,10 @@ def __init__( self.add_embedding = TextTimeEmbedding( text_time_embedding_from_dim, time_embed_dim, - num_heads=addition_embed_type_num_heads, ) - elif addition_embed_type is not None: - raise ValueError( - f"addition_embed_type: {addition_embed_type} must be None or 'text'." 
+ num_heads=addition_embed_type_num_heads, ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") if time_embedding_act_fn is None: self.time_embed_act = None @@ -324,8 +322,7 @@ def __init__( elif time_embedding_act_fn == "gelu": self.time_embed_act = nn.GELU() else: - raise ValueError( - f"Unsupported activation function: {time_embedding_act_fn}") + raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") self.down_blocks = nn.LayerList([]) self.up_blocks = nn.LayerList([]) @@ -333,18 +330,16 @@ def __init__( if isinstance(only_cross_attention, bool): if mid_block_only_cross_attention is None: mid_block_only_cross_attention = only_cross_attention - only_cross_attention = [only_cross_attention] * len( - down_block_types) + only_cross_attention = [only_cross_attention] * len(down_block_types) if mid_block_only_cross_attention is None: mid_block_only_cross_attention = False if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) if isinstance(cross_attention_dim, int): - cross_attention_dim = ( - cross_attention_dim, ) * len(down_block_types) + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) if isinstance(layers_per_block, int): layers_per_block = [layers_per_block] * len(down_block_types) @@ -397,7 +392,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) # mid @@ -415,7 +411,8 @@ def __init__( dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": self.mid_block = UNetMidBlock2DSimpleCrossAttn( in_channels=block_out_channels[-1], @@ -430,7 +427,8 @@ def __init__( skip_time_act=resnet_skip_time_act, only_cross_attention=mid_block_only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type is None: self.mid_block = None else: @@ -452,8 +450,7 @@ def __init__( prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] # add upsample block for all BUT final layer if not is_final_block: @@ -483,7 +480,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -492,7 +490,8 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) if act_fn == "swish": self.conv_act = lambda x: F.silu(x) elif 
act_fn == "mish": @@ -512,7 +511,8 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, - padding=conv_out_padding, ) + padding=conv_out_padding, + ) @property def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -524,16 +524,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -542,9 +538,7 @@ def fn_recursive_add_processors( return processors - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -569,8 +563,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -618,8 +611,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -631,14 +623,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. 
# Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -650,24 +640,22 @@ def fn_recursive_set_attention_slice(module: nn.Layer, fn_recursive_set_attention_slice(module, reversed_slice_size) def _set_gradient_checkpointing(self, module, value=False): - if isinstance( - module, - (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - down_block_additional_residuals: Optional[Tuple[ - paddle.Tensor]]=None, - mid_block_additional_residual: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet2DConditionOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor @@ -699,8 +687,7 @@ def forward( upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info( - "Forward upsample size to force interpolation output size.") + logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True # prepare attention_mask @@ -720,7 +707,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) # `Timesteps` does not contain any weights and will always return f32 tensors @@ -732,8 +723,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when num_class_embeds > 0") + raise ValueError("class_labels should be provided when num_class_embeds > 0") # maybe cast it to float16 class_labels = class_labels.cast(self.dtype) @@ -771,21 +761,16 @@ def forward( # 3. 
down - is_controlnet = (mid_block_additional_residual is not None and - down_block_additional_residuals is not None) - is_adapter = (mid_block_additional_residual is None and - down_block_additional_residuals is not None) + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: additional_kwargs = {} if is_adapter and len(down_block_additional_residuals) > 0: - additional_kwargs[ - "additional_residuals"] = down_block_additional_residuals.pop( - 0) + additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0) sample, res_samples = downsample_block( hidden_states=sample, @@ -793,25 +778,25 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, - **additional_kwargs, ) + **additional_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) if is_adapter and len(down_block_additional_residuals) > 0: sample += down_block_additional_residuals.pop(0) # westfish: add to align with torch features - res_samples = tuple(res_samples[:-1]) + (sample, ) + res_samples = tuple(res_samples[:-1]) + (sample,) down_block_res_samples += res_samples if is_controlnet: new_down_block_res_samples = () for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals): - down_block_res_sample = ( - down_block_res_sample + down_block_additional_residual) - new_down_block_res_samples += (down_block_res_sample, ) + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. 
mid @@ -821,7 +806,8 @@ def forward( emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if is_controlnet: sample = sample + mid_block_additional_residual @@ -830,17 +816,15 @@ def forward( for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] - if (hasattr(upsample_block, "has_cross_attention") and - upsample_block.has_cross_attention): + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, @@ -848,13 +832,15 @@ def forward( encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, ) + upsample_size=upsample_size, + ) # 6. post-process if self.conv_norm_out: @@ -863,6 +849,6 @@ def forward( sample = self.conv_out(sample) if not return_dict: - return (sample, ) + return (sample,) return UNet2DConditionOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py index f3feb516342c7..5e55038b49714 100644 --- a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py +++ b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py @@ -22,23 +22,24 @@ def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=True, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", ): + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=True, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): if down_block_type == "DownBlock3D": return DownBlock3D( num_layers=num_layers, @@ -50,11 +51,11 @@ def get_down_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) elif down_block_type == "CrossAttnDownBlock3D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnDownBlock3D") + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D") return CrossAttnDownBlock3D( num_layers=num_layers, in_channels=in_channels, @@ -71,28 +72,30 @@ def get_down_block( 
use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) raise ValueError(f"{down_block_type} does not exist.") def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=True, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", ): + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=True, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): if up_block_type == "UpBlock3D": return UpBlock3D( num_layers=num_layers, @@ -104,11 +107,11 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) elif up_block_type == "CrossAttnUpBlock3D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnUpBlock3D") + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D") return CrossAttnUpBlock3D( num_layers=num_layers, in_channels=in_channels, @@ -125,33 +128,34 @@ def get_up_block( use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) raise ValueError(f"{up_block_type} does not exist.") class UNetMidBlock3DCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels=1, - output_scale_factor=1.0, - cross_attention_dim=1280, - dual_cross_attention=False, - use_linear_projection=True, - upcast_attention=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + output_scale_factor=1.0, + cross_attention_dim=1280, + dual_cross_attention=False, + use_linear_projection=True, + upcast_attention=False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet resnets = [ ResnetBlock2D( @@ -164,13 +168,15 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, ) + pre_norm=resnet_pre_norm, + ) ] temp_convs = [ TemporalConvLayer( in_channels, in_channels, 
- dropout=0.1, ) + dropout=0.1, + ) ] attentions = [] temp_attentions = [] @@ -184,7 +190,9 @@ def __init__( cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) temp_attentions.append( TransformerTemporalModel( in_channels // attn_num_head_channels, @@ -192,7 +200,9 @@ def __init__( in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) resnets.append( ResnetBlock2D( in_channels=in_channels, @@ -204,38 +214,45 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( in_channels, in_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) self.attentions = nn.LayerList(attentions) self.temp_attentions = nn.LayerList(temp_attentions) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + num_frames=1, + cross_attention_kwargs=None, + ): hidden_states = self.resnets[0](hidden_states, temb) hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) for attn, temp_attn, resnet, temp_conv in zip( - self.attentions, self.temp_attentions, self.resnets[1:], - self.temp_convs[1:]): + self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:] + ): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = temp_attn( hidden_states, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) return hidden_states @@ -243,26 +260,27 @@ def forward( class CrossAttnDownBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - downsample_padding=1, - add_downsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): super().__init__() resnets = [] attentions = [] @@ -283,12 +301,16 @@ def __init__( 
time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) attentions.append( Transformer2DModel( out_channels // attn_num_head_channels, @@ -299,7 +321,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) temp_attentions.append( TransformerTemporalModel( out_channels // attn_num_head_channels, @@ -307,70 +331,79 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) self.attentions = nn.LayerList(attentions) self.temp_attentions = nn.LayerList(temp_attentions) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + num_frames=1, + cross_attention_kwargs=None, + ): output_states = () for resnet, temp_conv, attn, temp_attn in zip( - self.resnets, self.temp_convs, self.attentions, - self.temp_attentions): + self.resnets, self.temp_convs, self.attentions, self.temp_attentions + ): hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = temp_attn( hidden_states, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ).sample - output_states += (hidden_states, ) + cross_attention_kwargs=cross_attention_kwargs, + ).sample + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class DownBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor=1.0, - add_downsample=True, - downsample_padding=1, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_downsample=True, + downsample_padding=1, + ): 
super().__init__() resnets = [] temp_convs = [] @@ -387,23 +420,30 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False @@ -413,36 +453,37 @@ def forward(self, hidden_states, temb=None, num_frames=1): for resnet, temp_conv in zip(self.resnets, self.temp_convs): hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class CrossAttnUpBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - add_upsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + add_upsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): super().__init__() resnets = [] temp_convs = [] @@ -451,8 +492,7 @@ def __init__( self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock2D( @@ -465,12 +505,16 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) attentions.append( Transformer2DModel( out_channels // attn_num_head_channels, @@ -481,7 +525,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - 
upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) temp_attentions.append( TransformerTemporalModel( out_channels // attn_num_head_channels, @@ -489,48 +535,51 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.resnets = nn.LayerList(sublayers=resnets) self.temp_convs = nn.LayerList(sublayers=temp_convs) self.attentions = nn.LayerList(sublayers=attentions) self.temp_attentions = nn.LayerList(sublayers=temp_attentions) if add_upsample: - self.upsamplers = nn.LayerList(sublayers=[ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList( + sublayers=[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + upsample_size=None, + attention_mask=None, + num_frames=1, + cross_attention_kwargs=None, + ): for resnet, temp_conv, attn, temp_attn in zip( - self.resnets, self.temp_convs, self.attentions, - self.temp_attentions): + self.resnets, self.temp_convs, self.attentions, self.temp_attentions + ): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = temp_attn( hidden_states, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample if self.upsamplers is not None: for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states, upsample_size) @@ -539,26 +588,26 @@ def forward( class UpBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor=1.0, - add_upsample=True, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_upsample=True, + ): super().__init__() resnets = [] temp_convs = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock2D( @@ -571,36 +620,37 @@ def __init__( 
time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None, - num_frames=1, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None, + num_frames=1, + ): for resnet, temp_conv in zip(self.resnets, self.temp_convs): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - x=[hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) if self.upsamplers is not None: diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py index fb8ae5756d4c3..038e8c6d514a7 100644 --- a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py +++ b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py @@ -26,9 +26,15 @@ from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .transformer_temporal import TransformerTemporalModel -from .unet_3d_blocks import (CrossAttnDownBlock3D, CrossAttnUpBlock3D, - DownBlock3D, UNetMidBlock3DCrossAttn, UpBlock3D, - get_down_block, get_up_block) +from .unet_3d_blocks import ( + CrossAttnDownBlock3D, + CrossAttnUpBlock3D, + DownBlock3D, + UNetMidBlock3DCrossAttn, + UpBlock3D, + get_down_block, + get_up_block, +) logger = logging.get_logger(__name__) @@ -44,8 +50,7 @@ class UNet3DConditionOutput(BaseOutput): sample: paddle.Tensor -class UNet3DConditionModel(ModelMixin, ConfigMixin, - UNet2DConditionLoadersMixin): +class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" UNet3DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep and returns sample shaped output. 
@@ -79,29 +84,32 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, @register_to_config def __init__( - self, - sample_size: Optional[int]=None, - in_channels: int=4, - out_channels: int=4, - down_block_types: Tuple[str]=( - "CrossAttnDownBlock3D", - "CrossAttnDownBlock3D", - "CrossAttnDownBlock3D", - "DownBlock3D", ), - up_block_types: Tuple[str]=( - "UpBlock3D", - "CrossAttnUpBlock3D", - "CrossAttnUpBlock3D", - "CrossAttnUpBlock3D", ), - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: int=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-05, - cross_attention_dim: int=1024, - attention_head_dim: Union[int, Tuple[int]]=64, ): + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ), + up_block_types: Tuple[str] = ( + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ), + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-05, + cross_attention_dim: int = 1024, + attention_head_dim: Union[int, Tuple[int]] = 64, + ): super().__init__() self.sample_size = sample_size # Check inputs @@ -113,9 +121,7 @@ def __init__( raise ValueError( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." 
) @@ -126,7 +132,8 @@ def __init__( in_channels=in_channels, out_channels=block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time time_embed_dim = block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], True, 0) @@ -134,17 +141,19 @@ def __init__( self.time_embedding = TimestepEmbedding( timestep_input_dim, time_embed_dim, - act_fn=act_fn, ) + act_fn=act_fn, + ) self.transformer_in = TransformerTemporalModel( num_attention_heads=8, attention_head_dim=attention_head_dim, in_channels=block_out_channels[0], - num_layers=1, ) + num_layers=1, + ) # class embedding self.down_blocks = nn.LayerList(sublayers=[]) self.up_blocks = nn.LayerList(sublayers=[]) if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) # down output_channel = block_out_channels[0] @@ -165,7 +174,8 @@ def __init__( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[i], downsample_padding=downsample_padding, - dual_cross_attention=False, ) + dual_cross_attention=False, + ) self.down_blocks.append(down_block) # mid self.mid_block = UNetMidBlock3DCrossAttn( @@ -177,7 +187,8 @@ def __init__( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, - dual_cross_attention=False, ) + dual_cross_attention=False, + ) # count how many layers upsample the images self.num_upsamplers = 0 # up @@ -188,8 +199,7 @@ def __init__( is_final_block = i == len(block_out_channels) - 1 prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] # add upsample block for all BUT final layer if not is_final_block: add_upsample = True @@ -209,14 +219,16 @@ def __init__( resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim, attn_num_head_channels=reversed_attention_head_dim[i], - dual_cross_attention=False, ) + dual_cross_attention=False, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel if norm_num_groups is not None: self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) self.conv_act = nn.Silu() else: self.conv_norm_out = None @@ -226,7 +238,8 @@ def __init__( in_channels=block_out_channels[0], out_channels=out_channels, kernel_size=conv_out_kernel, - padding=conv_out_padding, ) + padding=conv_out_padding, + ) @property # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors @@ -239,16 +252,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -295,8 +304,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice 
possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -308,14 +316,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. # Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -327,9 +333,7 @@ def fn_recursive_set_attention_slice(module: nn.Layer, fn_recursive_set_attention_slice(module, reversed_slice_size) # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -354,8 +358,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -368,24 +371,22 @@ def set_default_attn_processor(self): self.set_attn_processor(AttnProcessor()) def _set_gradient_checkpointing(self, module, value=False): - if isinstance( - module, - (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): + if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - down_block_additional_residuals: Optional[Tuple[ - paddle.Tensor]]=None, - mid_block_additional_residual: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet3DConditionOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple]: """ Args: sample (`paddle.Tensor`): (batch, num_frames, channel, height, width) noisy inputs tensor @@ -417,8 +418,7 @@ 
def forward( upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info( - "Forward upsample size to force interpolation output size.") + logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True # prepare attention_mask if attention_mask is not None: @@ -436,7 +436,11 @@ def forward( elif len(timesteps.shape) == 0: timesteps = timesteps[None] num_frames = sample.shape[2] - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors @@ -445,38 +449,36 @@ def forward( t_emb = t_emb.cast(dtype=self.dtype) emb = self.time_embedding(t_emb, timestep_cond) emb = emb.repeat_interleave(repeats=num_frames, axis=0) - encoder_hidden_states = encoder_hidden_states.repeat_interleave( - repeats=num_frames, axis=0) - sample = sample.transpose([0, 2, 1, 3, 4]).reshape((sample.shape[ - 0] * num_frames, -1) + tuple(sample.shape[3:])) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0) + sample = sample.transpose([0, 2, 1, 3, 4]).reshape( + (sample.shape[0] * num_frames, -1) + tuple(sample.shape[3:]) + ) sample = self.conv_in(sample) sample = self.transformer_in( - sample, - num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs).sample + sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs + ).sample # 3. down - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb, num_frames=num_frames) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) down_block_res_samples += res_samples if down_block_additional_residuals is not None: new_down_block_res_samples = () for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals): - down_block_res_sample = ( - down_block_res_sample + down_block_additional_residual) - new_down_block_res_samples += (down_block_res_sample, ) + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. mid if self.mid_block is not None: @@ -486,21 +488,20 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if mid_block_additional_residual is not None: sample = sample + mid_block_additional_residual # 5. 
up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] - if (hasattr(upsample_block, "has_cross_attention") and - upsample_block.has_cross_attention): + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, @@ -509,23 +510,23 @@ def forward( upsample_size=upsample_size, attention_mask=attention_mask, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size, - num_frames=num_frames, ) + num_frames=num_frames, + ) # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) sample = self.conv_act(sample) sample = self.conv_out(sample) # reshape to (batch, channel, framerate, width, height) - sample = (sample[None, :] - .reshape((-1, num_frames) + tuple(sample.shape[1:])) - .transpose([0, 2, 1, 3, 4])) + sample = sample[None, :].reshape((-1, num_frames) + tuple(sample.shape[1:])).transpose([0, 2, 1, 3, 4]) if not return_dict: - return (sample, ) + return (sample,) return UNet3DConditionOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/uvit.py b/ppdiffusers/ppdiffusers/models/uvit.py index f2140122e269f..eb7267d41d2a2 100644 --- a/ppdiffusers/ppdiffusers/models/uvit.py +++ b/ppdiffusers/ppdiffusers/models/uvit.py @@ -27,21 +27,15 @@ def unpatchify(x, in_chans): - patch_size = int((x.shape[2] // in_chans)**0.5) - h = w = int(x.shape[1]**0.5) - assert h * w == x.shape[1] and patch_size**2 * in_chans == x.shape[2] - x = einops.rearrange( - x, - "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", - h=h, - p1=patch_size, - p2=patch_size) + patch_size = int((x.shape[2] // in_chans) ** 0.5) + h = w = int(x.shape[1] ** 0.5) + assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2] + x = einops.rearrange(x, "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", h=h, p1=patch_size, p2=patch_size) return x def interpolate_pos_emb(pos_emb, old_shape, new_shape): - pos_emb = einops.rearrange( - pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1]) + pos_emb = einops.rearrange(pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1]) pos_emb = F.interpolate(pos_emb, new_shape, mode="bilinear") pos_emb = einops.rearrange(pos_emb, "B C H W -> B (H W) C") return pos_emb @@ -49,13 +43,14 @@ def interpolate_pos_emb(pos_emb, old_shape, new_shape): class Attention(nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, ): + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads @@ -82,9 +77,10 @@ def reshape_batch_dim_to_heads(self, tensor, transpose=True): return tensor def set_use_memory_efficient_attention_xformers( - self, - 
use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 # if self.head_size > 128 and attention_op == "flash": # attention_op = "cutlass" @@ -96,18 +92,15 @@ def set_use_memory_efficient_attention_xformers( else: try: _ = F.scaled_dot_product_attention_( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e - self._use_memory_efficient_attention_xformers = ( - use_memory_efficient_attention_xformers) + self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers self._attention_op = attention_op def forward(self, x): @@ -116,14 +109,14 @@ def forward(self, x): qkv = qkv.cast(paddle.float32) query_proj, key_proj, value_proj = qkv.chunk(3, axis=-1) query_proj = self.reshape_heads_to_batch_dim( - query_proj, - transpose=not self._use_memory_efficient_attention_xformers) + query_proj, transpose=not self._use_memory_efficient_attention_xformers + ) key_proj = self.reshape_heads_to_batch_dim( - key_proj, - transpose=not self._use_memory_efficient_attention_xformers) + key_proj, transpose=not self._use_memory_efficient_attention_xformers + ) value_proj = self.reshape_heads_to_batch_dim( - value_proj, - transpose=not self._use_memory_efficient_attention_xformers) + value_proj, transpose=not self._use_memory_efficient_attention_xformers + ) if self._use_memory_efficient_attention_xformers: hidden_states = F.scaled_dot_product_attention_( @@ -134,18 +127,17 @@ def forward(self, x): scale=self.scale, dropout_p=self.attn_drop, training=self.training, - attention_op=self._attention_op, ) + attention_op=self._attention_op, + ) else: with paddle.amp.auto_cast(enable=False): - attention_scores = paddle.matmul( - query_proj * self.scale, key_proj, transpose_y=True) + attention_scores = paddle.matmul(query_proj * self.scale, key_proj, transpose_y=True) attention_probs = F.softmax(attention_scores, axis=-1) - hidden_states = paddle.matmul(attention_probs, - value_proj).cast(x.dtype) + hidden_states = paddle.matmul(attention_probs, value_proj).cast(x.dtype) hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, - transpose=not self._use_memory_efficient_attention_xformers) + hidden_states, transpose=not self._use_memory_efficient_attention_xformers + ) hidden_states = self.proj_drop(self.proj(hidden_states)) return hidden_states @@ -153,18 +145,19 @@ def forward(self, x): class Block(nn.Layer): def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - skip=False, ): + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + skip=False, + ): super().__init__() self.norm1 = norm_layer(dim) if skip else None self.norm2 = norm_layer(dim) @@ -175,16 +168,17 @@ def __init__( qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, - proj_drop=drop, ) - self.drop_path = DropPath( - 
drop_path) if drop_path > 0.0 else nn.Identity() + proj_drop=drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm3 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, - drop=drop, ) + drop=drop, + ) self.skip_linear = nn.Linear(2 * dim, dim) if skip else None def forward(self, x, skip=None): @@ -223,44 +217,43 @@ class UViTModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - sample_size=1, - img_size=64, - in_channels=4, - patch_size=2, - embed_dim=1536, - depth=30, - num_heads=24, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - pos_drop_rate=0.0, - drop_rate=0.0, - attn_drop_rate=0.0, - norm_type="layer_norm", - text_dim=64, - num_text_tokens=77, - clip_img_dim=512, - use_checkpoint=False, ): + self, + sample_size=1, + img_size=64, + in_channels=4, + patch_size=2, + embed_dim=1536, + depth=30, + num_heads=24, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + pos_drop_rate=0.0, + drop_rate=0.0, + attn_drop_rate=0.0, + norm_type="layer_norm", + text_dim=64, + num_text_tokens=77, + clip_img_dim=512, + use_checkpoint=False, + ): super().__init__() self.sample_size = sample_size self.in_channels = in_channels self.patch_size = patch_size self.embed_dim = embed_dim - self.img_size = (img_size, img_size) if isinstance(img_size, - int) else img_size + self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size self.patch_embed = PatchEmbed( height=self.img_size[0], width=self.img_size[1], patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim, - add_pos_embed=False, ) - assert self.img_size[0] % patch_size == 0 and self.img_size[ - 1] % patch_size == 0 - self.num_patches = (self.img_size[0] // patch_size) * ( - self.img_size[1] // patch_size) + add_pos_embed=False, + ) + assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0 + self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size) self.encode_prefix = nn.Linear(768, text_dim) @@ -274,22 +267,27 @@ def __init__( self.pos_embed = self.create_parameter( shape=(1, self.num_tokens, embed_dim), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) assert norm_type == "layer_norm", "We only support norm_type == layer_norm. 
" norm_layer = nn.LayerNorm self.pos_drop = nn.Dropout(p=pos_drop_rate) - self.in_blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, ) for _ in range(depth // 2) - ]) + self.in_blocks = nn.LayerList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + norm_layer=norm_layer, + ) + for _ in range(depth // 2) + ] + ) self.mid_block = Block( dim=embed_dim, @@ -299,20 +297,25 @@ def __init__( qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, - norm_layer=norm_layer, ) - - self.out_blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, - skip=True, ) for _ in range(depth // 2) - ]) + norm_layer=norm_layer, + ) + + self.out_blocks = nn.LayerList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + norm_layer=norm_layer, + skip=True, + ) + for _ in range(depth // 2) + ] + ) self.norm = norm_layer(embed_dim) self.patch_dim = patch_size**2 * in_channels @@ -320,18 +323,18 @@ def __init__( self.token_embedding = nn.Embedding(2, embed_dim) self.pos_embed_token = self.create_parameter( - shape=(1, 1, embed_dim), - default_initializer=nn.initializer.Constant(0.0)) + shape=(1, 1, embed_dim), default_initializer=nn.initializer.Constant(0.0) + ) def forward( - self, - img: paddle.Tensor, - clip_img: paddle.Tensor, - text: paddle.Tensor, - t_img: paddle.Tensor, - t_text: paddle.Tensor, - data_type: paddle.Tensor, - return_dict=False, # TODO: nf + self, + img: paddle.Tensor, + clip_img: paddle.Tensor, + text: paddle.Tensor, + t_img: paddle.Tensor, + t_text: paddle.Tensor, + data_type: paddle.Tensor, + return_dict=False, # TODO: nf ): _, _, H, W = img.shape # TODO junnyu, support float16 @@ -343,10 +346,8 @@ def forward( clip_img = self.clip_img_embed(clip_img) text = self.text_embed(text) - t_img_token = get_timestep_embedding(t_img, self.embed_dim, True, - 0).unsqueeze(axis=1) - t_text_token = get_timestep_embedding(t_text, self.embed_dim, True, - 0).unsqueeze(axis=1) + t_img_token = get_timestep_embedding(t_img, self.embed_dim, True, 0).unsqueeze(axis=1) + t_text_token = get_timestep_embedding(t_text, self.embed_dim, True, 0).unsqueeze(axis=1) token_embed = self.token_embedding(data_type).unsqueeze(axis=1) # TODO junnyu, support float16 @@ -354,35 +355,35 @@ def forward( t_text_token = t_text_token.cast(self.dtype) token_embed = token_embed.cast(self.dtype) - x = paddle.concat( - (t_img_token, t_text_token, token_embed, text, clip_img, img), - axis=1) + x = paddle.concat((t_img_token, t_text_token, token_embed, text, clip_img, img), axis=1) num_text_tokens, num_img_tokens = text.shape[1], img.shape[1] pos_embed = paddle.concat( [ - self.pos_embed[:, :1 + 1, :], + self.pos_embed[:, : 1 + 1, :], self.pos_embed_token, - self.pos_embed[:, 1 + 1:, :], + self.pos_embed[:, 1 + 1 :, :], ], - axis=1, ) + axis=1, + ) if H == self.img_size[0] and W == self.img_size[1]: pass else: # interpolate the positional embedding when the input image is not of the default shape pos_embed_others, pos_embed_patches = paddle.split( - pos_embed, [1 + 1 + 1 + num_text_tokens + 1, 
self.num_patches], - axis=1) + pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], axis=1 + ) pos_embed_patches = interpolate_pos_emb( pos_embed_patches, ( self.img_size[0] // self.patch_size, - self.img_size[1] // self.patch_size, ), - (H // self.patch_size, W // self.patch_size), ) - pos_embed = paddle.concat( - (pos_embed_others, pos_embed_patches), axis=1) + self.img_size[1] // self.patch_size, + ), + (H // self.patch_size, W // self.patch_size), + ) + pos_embed = paddle.concat((pos_embed_others, pos_embed_patches), axis=1) x = x + pos_embed x = self.pos_drop(x) @@ -405,8 +406,8 @@ def forward( token_embed_out, text_out, clip_img_out, - img_out, ) = x.split( - (1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1) + img_out, + ) = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1) img_out = self.decoder_pred(img_out) sample_img = unpatchify(img_out, self.in_channels) @@ -419,4 +420,5 @@ def forward( return UViTModelOutput( sample_img=sample_img, sample_clip_img=sample_clip_img, - sample_text=sample_text, ) + sample_text=sample_text, + ) diff --git a/ppdiffusers/ppdiffusers/models/vae.py b/ppdiffusers/ppdiffusers/models/vae.py index f3b9a81b43a67..4b1fce10910a6 100644 --- a/ppdiffusers/ppdiffusers/models/vae.py +++ b/ppdiffusers/ppdiffusers/models/vae.py @@ -53,24 +53,20 @@ class DecoderOutput(BaseOutput): class Encoder(nn.Layer): def __init__( - self, - in_channels=3, - out_channels=3, - down_block_types=("DownEncoderBlock2D", ), - block_out_channels=(64, ), - layers_per_block=2, - norm_num_groups=32, - act_fn="silu", - double_z=True, ): + self, + in_channels=3, + out_channels=3, + down_block_types=("DownEncoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + norm_num_groups=32, + act_fn="silu", + double_z=True, + ): super().__init__() self.layers_per_block = layers_per_block - self.conv_in = nn.Conv2D( - in_channels, - block_out_channels[0], - kernel_size=3, - stride=1, - padding=1) + self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) self.mid_block = None self.down_blocks = nn.LayerList([]) @@ -93,7 +89,8 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, ) + temb_channels=None, + ) self.down_blocks.append(down_block) # mid @@ -105,18 +102,19 @@ def __init__( resnet_time_scale_shift="default", attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, ) + temb_channels=None, + ) # out self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[-1], num_groups=norm_num_groups, - epsilon=1e-6, ) + epsilon=1e-6, + ) self.conv_act = nn.Silu() conv_out_channels = 2 * out_channels if double_z else out_channels - self.conv_out = nn.Conv2D( - block_out_channels[-1], conv_out_channels, 3, padding=1) + self.conv_out = nn.Conv2D(block_out_channels[-1], conv_out_channels, 3, padding=1) self.gradient_checkpointing = False def forward(self, x): @@ -156,23 +154,19 @@ def custom_forward(*inputs): class Decoder(nn.Layer): def __init__( - self, - in_channels=3, - out_channels=3, - up_block_types=("UpDecoderBlock2D", ), - block_out_channels=(64, ), - layers_per_block=2, - norm_num_groups=32, - act_fn="silu", ): + self, + in_channels=3, + out_channels=3, + up_block_types=("UpDecoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + norm_num_groups=32, + act_fn="silu", + ): super().__init__() self.layers_per_block = layers_per_block - self.conv_in = nn.Conv2D( - in_channels, - block_out_channels[-1], - 
kernel_size=3, - stride=1, - padding=1) + self.conv_in = nn.Conv2D(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1) self.mid_block = None self.up_blocks = nn.LayerList([]) @@ -186,7 +180,8 @@ def __init__( resnet_time_scale_shift="default", attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, ) + temb_channels=None, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -208,18 +203,15 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, ) + temb_channels=None, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[0], - num_groups=norm_num_groups, - epsilon=1e-6) + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=1e-6) self.conv_act = nn.Silu() - self.conv_out = nn.Conv2D( - block_out_channels[0], out_channels, 3, padding=1) + self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, 3, padding=1) self.gradient_checkpointing = False def forward(self, z): @@ -255,8 +247,7 @@ def custom_forward(*inputs): # (TODO, junnyu) check nan # clamp inf values to enable fp16 training - if (amp_state() or - sample.dtype == paddle.float16) and paddle.isinf(sample).any(): + if (amp_state() or sample.dtype == paddle.float16) and paddle.isinf(sample).any(): clamp_value = finfo(sample.dtype).max - 1000 sample = paddle.clip(sample, min=-clamp_value, max=clamp_value) @@ -278,14 +269,15 @@ class VectorQuantizer(nn.Layer): # backwards compatibility we use the buggy version by default, but you can # specify legacy=False to fix it. def __init__( - self, - n_e, - vq_embed_dim, - beta, - remap=None, - unknown_index="random", - sane_index_shape=False, - legacy=True, ): + self, + n_e, + vq_embed_dim, + beta, + remap=None, + unknown_index="random", + sane_index_shape=False, + legacy=True, + ): super().__init__() self.n_e = n_e self.vq_embed_dim = vq_embed_dim @@ -306,8 +298,10 @@ def __init__( if self.unknown_index == "extra": self.unknown_index = self.re_embed self.re_embed = self.re_embed + 1 - print(f"Remapping {self.n_e} indices to {self.re_embed} indices. " - f"Using {self.unknown_index} for unknown indices.") + print( + f"Remapping {self.n_e} indices to {self.re_embed} indices. " + f"Using {self.unknown_index} for unknown indices." 
+ ) else: self.re_embed = n_e @@ -322,8 +316,7 @@ def remap_to_used(self, inds): new = match.argmax(-1) unknown = match.sum(2) < 1 if self.unknown_index == "random": - new[unknown] = paddle.randint( - 0, self.re_embed, shape=new[unknown].shape) + new[unknown] = paddle.randint(0, self.re_embed, shape=new[unknown].shape) else: new[unknown] = self.unknown_index return new.reshape(ishape) @@ -335,8 +328,7 @@ def unmap_to_all(self, inds): used = self.used.cast(inds.dtype) if self.re_embed > self.used.shape[0]: # extra token inds[inds >= self.used.shape[0]] = 0 # simply set to zero - back = paddle.take_along_axis( - used[None, :][inds.shape[0] * [0], :], inds, axis=1) + back = paddle.take_along_axis(used[None, :][inds.shape[0] * [0], :], inds, axis=1) return back.reshape(ishape) def forward(self, z): @@ -345,9 +337,11 @@ def forward(self, z): z_flattened = z.reshape([-1, self.vq_embed_dim]) # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z - d = (paddle.sum(z_flattened**2, axis=1, keepdim=True) + paddle.sum( - self.embedding.weight**2, axis=1) - 2 * paddle.matmul( - z_flattened, self.embedding.weight, transpose_y=True)) + d = ( + paddle.sum(z_flattened**2, axis=1, keepdim=True) + + paddle.sum(self.embedding.weight**2, axis=1) + - 2 * paddle.matmul(z_flattened, self.embedding.weight, transpose_y=True) + ) min_encoding_indices = paddle.argmin(d, axis=1) z_q = self.embedding(min_encoding_indices).reshape(z.shape) @@ -356,11 +350,9 @@ def forward(self, z): # compute loss for embedding if not self.legacy: - loss = self.beta * paddle.mean((z_q.detach() - z)**2) + paddle.mean( - (z_q - z.detach())**2) + loss = self.beta * paddle.mean((z_q.detach() - z) ** 2) + paddle.mean((z_q - z.detach()) ** 2) else: - loss = paddle.mean((z_q.detach() - z)**2) + self.beta * paddle.mean( - (z_q - z.detach())**2) + loss = paddle.mean((z_q.detach() - z) ** 2) + self.beta * paddle.mean((z_q - z.detach()) ** 2) # preserve gradients z_q = z + (z_q - z).detach() @@ -369,15 +361,12 @@ def forward(self, z): z_q = z_q.transpose([0, 3, 1, 2]) if self.remap is not None: - min_encoding_indices = min_encoding_indices.reshape( - [z.shape[0], -1]) # add batch axis + min_encoding_indices = min_encoding_indices.reshape([z.shape[0], -1]) # add batch axis min_encoding_indices = self.remap_to_used(min_encoding_indices) - min_encoding_indices = min_encoding_indices.reshape( - [-1, 1]) # flatten + min_encoding_indices = min_encoding_indices.reshape([-1, 1]) # flatten if self.sane_index_shape: - min_encoding_indices = min_encoding_indices.reshape( - [z_q.shape[0], z_q.shape[2], z_q.shape[3]]) + min_encoding_indices = min_encoding_indices.reshape([z_q.shape[0], z_q.shape[2], z_q.shape[3]]) return z_q, loss, (perplexity, min_encodings, min_encoding_indices) @@ -386,7 +375,11 @@ def get_codebook_entry(self, indices, shape): if self.remap is not None: indices = indices.reshape([shape[0], -1]) # add batch axis indices = self.unmap_to_all(indices) - indices = indices.reshape([-1, ]) # flatten again + indices = indices.reshape( + [ + -1, + ] + ) # flatten again # get quantized latent vectors z_q = self.embedding(indices) @@ -408,14 +401,11 @@ def __init__(self, parameters, deterministic=False): self.std = paddle.exp(0.5 * self.logvar) self.var = paddle.exp(self.logvar) if self.deterministic: - self.var = self.std = paddle.zeros_like( - self.mean, dtype=self.parameters.dtype) + self.var = self.std = paddle.zeros_like(self.mean, dtype=self.parameters.dtype) - def sample(self, - generator: Optional[paddle.Generator]=None) -> 
paddle.Tensor: + def sample(self, generator: Optional[paddle.Generator] = None) -> paddle.Tensor: # make sure sample is on the same device as the parameters and has same dtype - sample = randn_tensor( - self.mean.shape, generator=generator, dtype=self.parameters.dtype) + sample = randn_tensor(self.mean.shape, generator=generator, dtype=self.parameters.dtype) x = self.mean + self.std * sample return x @@ -426,21 +416,26 @@ def kl(self, other=None): if other is None: return 0.5 * paddle.sum( paddle.pow(self.mean, 2) + self.var - 1.0 - self.logvar, - axis=[1, 2, 3], ) + axis=[1, 2, 3], + ) else: return 0.5 * paddle.sum( - paddle.pow(self.mean - other.mean, 2) / other.var + self.var - / other.var - 1.0 - self.logvar + other.logvar, - axis=[1, 2, 3], ) + paddle.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + axis=[1, 2, 3], + ) def nll(self, sample, axis=[1, 2, 3]): if self.deterministic: return paddle.to_tensor([0.0]) logtwopi = np.log(2.0 * np.pi) return 0.5 * paddle.sum( - logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / - self.var, - axis=axis, ) + logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / self.var, + axis=axis, + ) def mode(self): return self.mean diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py index 87a07653649cd..8104816e90486 100644 --- a/ppdiffusers/ppdiffusers/models/vq_model.py +++ b/ppdiffusers/ppdiffusers/models/vq_model.py @@ -69,20 +69,21 @@ class VQModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=3, - out_channels: int=3, - down_block_types: Tuple[str]=("DownEncoderBlock2D", ), - up_block_types: Tuple[str]=("UpDecoderBlock2D", ), - block_out_channels: Tuple[int]=(64, ), - layers_per_block: int=1, - act_fn: str="silu", - latent_channels: int=3, - sample_size: int=32, - num_vq_embeddings: int=256, - norm_num_groups: int=32, - vq_embed_dim: Optional[int]=None, - scaling_factor: float=0.18215, ): + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 3, + sample_size: int = 32, + num_vq_embeddings: int = 256, + norm_num_groups: int = 32, + vq_embed_dim: Optional[int] = None, + scaling_factor: float = 0.18215, + ): super().__init__() # pass init params to Encoder @@ -94,7 +95,8 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - double_z=False, ) + double_z=False, + ) vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels @@ -104,7 +106,8 @@ def __init__( vq_embed_dim, beta=0.25, remap=None, - sane_index_shape=False, ) + sane_index_shape=False, + ) self.post_quant_conv = nn.Conv2D(vq_embed_dim, latent_channels, 1) # pass init params to Decoder @@ -115,22 +118,24 @@ def __init__( block_out_channels=block_out_channels, layers_per_block=layers_per_block, act_fn=act_fn, - norm_num_groups=norm_num_groups, ) + norm_num_groups=norm_num_groups, + ) - def encode(self, x: paddle.Tensor, return_dict: bool=True): + def encode(self, x: paddle.Tensor, return_dict: bool = True): h = self.encoder(x) h = self.quant_conv(h) if not return_dict: - return (h, ) + return (h,) return VQEncoderOutput(latents=h) def decode( - self, - h: paddle.Tensor, - force_not_quantize: bool=False, - return_dict: bool=True, ): + 
self, + h: paddle.Tensor, + force_not_quantize: bool = False, + return_dict: bool = True, + ): # cast h to float16 / float32 h = h.cast(self.dtype) # also go through quantization layer @@ -142,11 +147,11 @@ def decode( dec = self.decoder(quant) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) - def forward(self, sample: paddle.Tensor, return_dict: bool=True): + def forward(self, sample: paddle.Tensor, return_dict: bool = True): r""" Args: sample (`paddle.Tensor`): Input sample. @@ -158,6 +163,6 @@ def forward(self, sample: paddle.Tensor, return_dict: bool=True): dec = self.decode(h).sample if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) diff --git a/ppdiffusers/ppdiffusers/optimization.py b/ppdiffusers/ppdiffusers/optimization.py index 738ef9f4d113f..d6c5efafaed3f 100644 --- a/ppdiffusers/ppdiffusers/optimization.py +++ b/ppdiffusers/ppdiffusers/optimization.py @@ -34,7 +34,7 @@ class SchedulerType(Enum): CONSTANT_WITH_WARMUP = "constant_with_warmup" -def get_constant_schedule(learning_rate: float, last_epoch: int=-1): +def get_constant_schedule(learning_rate: float, last_epoch: int = -1): """ Create a schedule with a constant learning rate, using the learning rate set in optimizer. @@ -50,9 +50,7 @@ def get_constant_schedule(learning_rate: float, last_epoch: int=-1): return LambdaDecay(learning_rate, lambda _: 1, last_epoch=last_epoch) -def get_constant_schedule_with_warmup(learning_rate: float, - num_warmup_steps: int, - last_epoch: int=-1): +def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_steps: int, last_epoch: int = -1): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and the initial lr set in the optimizer. @@ -78,10 +76,11 @@ def lr_lambda(current_step: int): def get_linear_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. 
@@ -105,18 +104,19 @@ def lr_lambda(current_step: int): return float(current_step) / float(max(1, num_warmup_steps)) return max( 0.0, - float(num_training_steps - current_step) / - float(max(1, num_training_steps - num_warmup_steps)), ) + float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)), + ) return LambdaDecay(learning_rate, lr_lambda, last_epoch) def get_cosine_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - num_cycles: float=0.5, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the @@ -142,21 +142,19 @@ def get_cosine_schedule_with_warmup( def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float( - max(1, num_training_steps - num_warmup_steps)) - return max( - 0.0, 0.5 * - (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) return LambdaDecay(learning_rate, lr_lambda, last_epoch) def get_cosine_with_hard_restarts_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - num_cycles: int=1, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: int = 1, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases @@ -181,25 +179,25 @@ def get_cosine_with_hard_restarts_schedule_with_warmup( def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float( - max(1, num_training_steps - num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) if progress >= 1.0: return 0.0 return max( 0.0, - 0.5 * (1.0 + math.cos(math.pi * ( - (float(num_cycles) * progress) % 1.0))), ) + 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))), + ) return LambdaDecay(learning_rate, lr_lambda, last_epoch) def get_polynomial_decay_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - lr_end: float=1e-7, - power: float=1.0, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + lr_end: float = 1e-7, + power: float = 1.0, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the @@ -230,8 +228,7 @@ def get_polynomial_decay_schedule_with_warmup( lr_init = learning_rate if not (lr_init > lr_end): - raise ValueError( - f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") + 
raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") def lr_lambda(current_step: int): if current_step < num_warmup_steps: @@ -251,8 +248,7 @@ def lr_lambda(current_step: int): TYPE_TO_SCHEDULER_FUNCTION = { SchedulerType.LINEAR: get_linear_schedule_with_warmup, SchedulerType.COSINE: get_cosine_schedule_with_warmup, - SchedulerType.COSINE_WITH_RESTARTS: - get_cosine_with_hard_restarts_schedule_with_warmup, + SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup, SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, SchedulerType.CONSTANT: get_constant_schedule, SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, @@ -260,13 +256,14 @@ def lr_lambda(current_step: int): def get_scheduler( - name: Union[str, SchedulerType], - learning_rate: float=0.1, - num_warmup_steps: Optional[int]=None, - num_training_steps: Optional[int]=None, - num_cycles: int=1, - power: float=1.0, - last_epoch: int=-1, ): + name: Union[str, SchedulerType], + learning_rate: float = 0.1, + num_warmup_steps: Optional[int] = None, + num_training_steps: Optional[int] = None, + num_cycles: int = 1, + power: float = 1.0, + last_epoch: int = -1, +): """ Unified API to get any scheduler from its name. @@ -295,20 +292,18 @@ def get_scheduler( # All other schedulers require `num_warmup_steps` if num_warmup_steps is None: - raise ValueError( - f"{name} requires `num_warmup_steps`, please provide that argument.") + raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") if name == SchedulerType.CONSTANT_WITH_WARMUP: return schedule_func( learning_rate=learning_rate, num_warmup_steps=num_warmup_steps, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) # All other schedulers require `num_training_steps` if num_training_steps is None: - raise ValueError( - f"{name} requires `num_training_steps`, please provide that argument." 
- ) + raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") if name == SchedulerType.COSINE_WITH_RESTARTS: return schedule_func( @@ -316,7 +311,8 @@ def get_scheduler( num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_cycles=num_cycles, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) if name == SchedulerType.POLYNOMIAL: return schedule_func( @@ -324,10 +320,12 @@ def get_scheduler( num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=power, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) return schedule_func( learning_rate=learning_rate, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) diff --git a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py index 24a72c1aa650d..74f6bdbb6b2b6 100644 --- a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py +++ b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py @@ -25,11 +25,25 @@ from typing import Any, Callable, Dict, List, Optional, Tuple from ..utils import ( - DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, HF_HUB_OFFLINE, - LOW_CPU_MEM_USAGE_DEFAULT, PPDIFFUSERS_CACHE, TO_DIFFUSERS, _add_variant, - _get_model_file, get_logger, is_paddle_available, is_paddlenlp_available, - is_ppxformers_available, is_safetensors_available, is_torch_available, - is_torch_file, smart_load, str2bool) + DIFFUSERS_CACHE, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + LOW_CPU_MEM_USAGE_DEFAULT, + PPDIFFUSERS_CACHE, + TO_DIFFUSERS, + _add_variant, + _get_model_file, + get_logger, + is_paddle_available, + is_paddlenlp_available, + is_ppxformers_available, + is_safetensors_available, + is_torch_available, + is_torch_file, + smart_load, + str2bool, +) logger = get_logger(__name__) @@ -60,8 +74,7 @@ def copy_func(f): "Copy a non-builtin function (NB `copy.copy` does not work for this)" if not isinstance(f, FunctionType): return copy.copy(f) - fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, - f.__closure__) + fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__) fn.__kwdefaults__ = f.__kwdefaults__ fn.__dict__.update(f.__dict__) fn.__annotations__.update(f.__annotations__) @@ -81,7 +94,7 @@ def __get__(self, _, f_cls): def patch_to(cls, as_prop=False, cls_method=False): "Decorator: add `f` to `cls`" if not isinstance(cls, (tuple, list)): - cls = (cls, ) + cls = (cls,) def _inner(f): for c_ in cls: @@ -108,11 +121,11 @@ def _inner(f): def is_floating_point(x): if not isinstance(x, (paddle.Tensor, paddle.static.Variable)): - raise TypeError( - "Expected Tensor, but received type of x: {}".format(type(x))) + raise TypeError("Expected Tensor, but received type of x: {}".format(type(x))) dtype = x.dtype - is_fp_dtype = (dtype == paddle.float32 or dtype == paddle.float64 or - dtype == paddle.float16 or dtype == paddle.bfloat16) + is_fp_dtype = ( + dtype == paddle.float32 or dtype == paddle.float64 or dtype == paddle.float16 or dtype == paddle.bfloat16 + ) return is_fp_dtype if not hasattr(paddle, "is_floating_point"): @@ -219,7 +232,8 @@ def Parameter(data: paddle.Tensor, requires_grad=True): tensor = paddle.create_parameter( data.shape, dtype=data.dtype, - default_initializer=nn.initializer.Assign(data), ) + default_initializer=nn.initializer.Assign(data), + ) if not requires_grad: tensor.stop_gradient = True return tensor @@ -247,8 +261,7 @@ def get_sublayer(self, 
target: str): for item in atoms: if not hasattr(mod, item): - raise AttributeError(mod.__class__.__name__ + " has no " - "attribute `" + item + "`") + raise AttributeError(mod.__class__.__name__ + " has no " "attribute `" + item + "`") mod = getattr(mod, item) @@ -259,23 +272,21 @@ def get_sublayer(self, target: str): nn.Layer.get_sublayer = get_sublayer class _WrappedHook: - def __init__(self, hook: Callable, module: Optional["nn.Layer"]=None): + def __init__(self, hook: Callable, module: Optional["nn.Layer"] = None): self.hook: Callable = hook functools.update_wrapper(self, hook) self.with_module: bool = False if module is not None: - self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref( - module) + self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(module) self.with_module = True def __call__(self, *args: Any, **kwargs: Any) -> Any: if self.with_module: module = self.module() if module is None: - raise RuntimeError( - "You are trying to call the hook of a dead Module!") + raise RuntimeError("You are trying to call the hook of a dead Module!") return self.hook(module, *args, **kwargs) return self.hook(*args, **kwargs) @@ -292,8 +303,7 @@ def __setstate__(self, state: Dict): if self.with_module: if state["module"] is None: - raise RuntimeError( - "You are trying to revive the hook of a dead Module!") + raise RuntimeError("You are trying to revive the hook of a dead Module!") self.module = weakref.ref(state["module"]) try: @@ -305,22 +315,20 @@ def register_load_state_dict_pre_hook(self, hook, with_module=False): if not hasattr(self, "load_state_dict_pre_hooks"): self.load_state_dict_pre_hooks = OrderedDict() handle = HookRemoveHelper(self.load_state_dict_pre_hooks) - self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook( - hook, self if with_module else None) + self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(hook, self if with_module else None) return handle nn.Layer.register_load_state_dict_pre_hook = register_load_state_dict_pre_hook raw_set_state_dict = nn.Layer.set_state_dict - def set_state_dict(self, state_dict, use_structured_name: bool=True): + def set_state_dict(self, state_dict, use_structured_name: bool = True): if hasattr(self, "load_state_dict_pre_hooks"): for hook in self.load_state_dict_pre_hooks.values(): hook(state_dict) # POP is_torch_weight state_dict.pop("is_torch_weight", None) - return raw_set_state_dict( - self, state_dict, use_structured_name=use_structured_name) + return raw_set_state_dict(self, state_dict, use_structured_name=use_structured_name) nn.Layer.set_state_dict = set_state_dict nn.Layer.load_dict = nn.Layer.set_state_dict @@ -338,12 +346,12 @@ def set_state_dict(self, state_dict, use_structured_name: bool=True): from ..utils.paddle_utils import no_init_weights if is_ppxformers_available(): - from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention + from paddle.incubate.nn.memory_efficient_attention import ( + memory_efficient_attention, + ) from paddle.nn.functional.flash_attention import flash_attention - sdp_kernel = paddle.nn.functional.flash_attention._select_sdp_cuda(128 + - 64) + sdp_kernel = paddle.nn.functional.flash_attention._select_sdp_cuda(128 + 64) if sdp_kernel == "mem_efficient": flash_attn_version = 1 else: @@ -353,33 +361,32 @@ def set_state_dict(self, state_dict, use_structured_name: bool=True): flash_attn_error = None try: _ = flash_attention( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - 
paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + ) except Exception as error: flash_attn_error = error is_support_flash_attention = False def scaled_dot_product_attention_( - query, - key, - value, - attn_mask=None, - dropout_p=0.0, - is_causal=False, - scale=None, - training=True, - attention_op=None, ): + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=None, + training=True, + attention_op=None, + ): if attention_op in [None, "auto"]: head_dim = query.shape[-1] attention_op = "cutlass" if is_support_flash_attention and query.dtype in [ - paddle.float16, - paddle.bfloat16, + paddle.float16, + paddle.bfloat16, ]: if flash_attn_version == 1: if head_dim <= 128: @@ -403,17 +410,12 @@ def scaled_dot_product_attention_( else: if attn_mask is not None: attn_mask = paddle.transpose(attn_mask, [0, 2, 1, 3]) - if (attn_mask.cast("float32").min() == 0 and - attn_mask.cast("float32").max() == 1): + if attn_mask.cast("float32").min() == 0 and attn_mask.cast("float32").max() == 1: attn_mask = (attn_mask.cast(s.dtype) - 1) * 10000.0 s = s + attn_mask p = paddle.nn.functional.softmax(s, axis=-1) if dropout_p > 0.0: - p = paddle.nn.functional.dropout( - p, - dropout_p, - training=training, - mode="upscale_in_train") + p = paddle.nn.functional.dropout(p, dropout_p, training=training, mode="upscale_in_train") o = paddle.matmul(p, vt) return paddle.transpose(o, [0, 2, 1, 3]) elif attention_op == "cutlass": @@ -427,7 +429,8 @@ def scaled_dot_product_attention_( None, p=dropout_p if training else 0.0, scale=scale, - training=True, ) # make sure we use training=True + training=True, + ) # make sure we use training=True elif attention_op == "flash": output = flash_attention( query, @@ -435,15 +438,13 @@ def scaled_dot_product_attention_( value, dropout=dropout_p, causal=is_causal, - return_softmax=False, )[0] + return_softmax=False, + )[0] else: - raise ValueError( - "ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']" - ) + raise ValueError("ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']") return output - paddle.nn.functional.scaled_dot_product_attention_ = ( - scaled_dot_product_attention_) + paddle.nn.functional.scaled_dot_product_attention_ = scaled_dot_product_attention_ @patch_to(nn.Layer, as_prop=True) def dtype(parameter: nn.Layer) -> paddle.dtype: @@ -474,8 +475,10 @@ def device(self): from shutil import copyfile import sentencepiece as spm - from paddlenlp.transformers.tokenizer_utils import (AddedToken, - PretrainedTokenizer) + from paddlenlp.transformers.tokenizer_utils import ( + AddedToken, + PretrainedTokenizer, + ) SPIECE_UNDERLINE = "▁" @@ -495,24 +498,24 @@ class XLMRobertaTokenizer(PretrainedTokenizer): model_input_names = ["input_ids", "attention_mask"] def __init__( - self, - vocab_file, - bos_token="<s>", - eos_token="</s>", - sep_token="</s>", - cls_token="<s>", - unk_token="<unk>", - pad_token="<pad>", - mask_token="<mask>", - sp_model_kwargs: Optional[Dict[str, Any]]=None, - **kwargs, ) -> None: + self, + vocab_file, + bos_token="<s>", + eos_token="</s>", + sep_token="</s>", + cls_token="<s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: # Mask token behave like a normal word, i.e.
include the space before it - mask_token = (AddedToken( - mask_token, lstrip=True, rstrip=False) - if isinstance(mask_token, str) else mask_token) + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + ) - self.sp_model_kwargs = ({} if sp_model_kwargs is None else - sp_model_kwargs) + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs super().__init__( bos_token=bos_token, @@ -523,10 +526,10 @@ def __init__( pad_token=pad_token, mask_token=mask_token, sp_model_kwargs=self.sp_model_kwargs, - **kwargs, ) + **kwargs, + ) - self.sp_model = spm.SentencePieceProcessor( - **self.sp_model_kwargs) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -547,12 +550,8 @@ def __init__( # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab self.fairseq_offset = 1 - self.fairseq_tokens_to_ids[""] = ( - len(self.sp_model) + self.fairseq_offset) - self.fairseq_ids_to_tokens = { - v: k - for k, v in self.fairseq_tokens_to_ids.items() - } + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def __getstate__(self): state = self.__dict__.copy() @@ -567,14 +566,12 @@ def __setstate__(self, d): if not hasattr(self, "sp_model_kwargs"): self.sp_model_kwargs = {} - self.sp_model = spm.SentencePieceProcessor( - **self.sp_model_kwargs) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.LoadFromSerializedProto(self.sp_model_proto) def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]]=None) -> List[int]: + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An XLM-RoBERTa sequence has the following format: @@ -590,17 +587,17 @@ def build_inputs_with_special_tokens( """ if token_ids_1 is None: - return [self.cls_token_id - ] + token_ids_0 + [self.sep_token_id] + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]]=None, - already_has_special_tokens: bool=False, ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. 
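For quick reference, the single-sequence and sequence-pair layouts produced by build_inputs_with_special_tokens above can be checked with a tiny standalone sketch; the integer ids below are placeholders, not the real XLM-RoBERTa vocabulary ids.

# Standalone sketch of the two layouts built above (ids are illustrative only).
CLS, SEP = 0, 2  # stand-ins for <s> and </s>

def build_inputs(token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [CLS] + token_ids_0 + [SEP]                         # <s> A </s>
    return [CLS] + token_ids_0 + [SEP, SEP] + token_ids_1 + [SEP]  # <s> A </s></s> B </s>

assert build_inputs([11, 12]) == [0, 11, 12, 2]
assert build_inputs([11, 12], [21]) == [0, 11, 12, 2, 2, 21, 2]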
@@ -619,17 +616,16 @@ def get_special_tokens_mask( return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, - already_has_special_tokens=True, ) + already_has_special_tokens=True, + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] - return ([1] + ([0] * len(token_ids_0)) + [1, 1] + - ([0] * len(token_ids_1)) + [1]) + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]]=None) -> List[int]: + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned. @@ -647,19 +643,14 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + - sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self): - return (len(self.sp_model) + self.fairseq_offset + 1 - ) # Add the token + return len(self.sp_model) + self.fairseq_offset + 1 # Add the token def get_vocab(self): - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab @@ -683,33 +674,28 @@ def _convert_id_to_token(self, index): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, - " ").strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string - def save_vocabulary( - self, save_directory: str, - filename_prefix: Optional[str]=None) -> Tuple[str]: + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): - logger.error( - f"Vocabulary path ({save_directory}) should be a directory" - ) + logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + - self.resource_files_names["vocab_file"], ) + (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], + ) - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file) and os.path.isfile(self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile( + self.vocab_file + ): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto( - ) + content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file, ) + return (out_vocab_file,) paddlenlp.transformers.XLMRobertaTokenizer = XLMRobertaTokenizer @@ -719,16 +705,17 @@ def save_vocabulary( BertModel.raw_forward = BertModel.forward def forward_new( - self, - input_ids: paddle.Tensor, - token_type_ids: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]]=None, - use_cache: 
Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - output_attentions: Optional[bool]=None, - return_dict: Optional[bool]=None, ): + self, + input_ids: paddle.Tensor, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): if attention_mask is None: attention_mask = paddle.ones_like(input_ids) return self.raw_forward( @@ -740,7 +727,8 @@ def forward_new( use_cache=use_cache, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_dict=return_dict, ) + return_dict=return_dict, + ) BertModel.forward = forward_new @@ -748,13 +736,10 @@ def forward_new( TRANSFORMERS_WEIGHTS_NAME = "pytorch_model.bin" # patch from_pretrained and save_pretrained - def from_pretrained_v3(cls, - pretrained_model_name_or_path, - *args, - from_hf_hub: bool=False, - **kwargs): - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + def from_pretrained_v3(cls, pretrained_model_name_or_path, *args, from_hf_hub: bool = False, **kwargs): + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) from_diffusers = kwargs.pop("from_diffusers", None) @@ -773,8 +758,7 @@ def from_pretrained_v3(cls, paddle_dtype = _dtype subfolder = kwargs.pop("subfolder", None) variant = kwargs.pop("variant", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", - LOW_CPU_MEM_USAGE_DEFAULT) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) user_agent = { "ppdiffusers": __version__, @@ -787,8 +771,7 @@ def from_pretrained_v3(cls, model_kwargs = kwargs # 1. get the PretrainedConfig to init model if not isinstance(config, PretrainedConfig): - config_path = (config if config is not None else - pretrained_model_name_or_path) + config_path = config if config is not None else pretrained_model_name_or_path # TODO fix config from_pretrained # must from hf hub @@ -797,9 +780,11 @@ def from_pretrained_v3(cls, kwargs["subfolder"] = subfolder else: if subfolder is not None: - config_path = (os.path.join(config_path, subfolder) - if os.path.isdir(config_path) else - "/".join([config_path, subfolder])) + config_path = ( + os.path.join(config_path, subfolder) + if os.path.isdir(config_path) + else "/".join([config_path, subfolder]) + ) config, model_kwargs = cls.config_class.from_pretrained( config_path, @@ -807,12 +792,12 @@ def from_pretrained_v3(cls, return_unused_kwargs=True, force_download=force_download, from_hf_hub=from_hf_hub, - **kwargs, ) + **kwargs, + ) assert config is not None # we will remove in the future. 
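The file relies throughout on the same wrap-and-reassign pattern seen in forward_new above (and in raw_set_state_dict earlier): keep a reference to the original method, install a thin wrapper that only fills in a default, then delegate. A minimal, framework-free sketch with illustrative names:

# Framework-free sketch of the wrap-and-reassign patching pattern (names are
# illustrative, not paddlenlp's BertModel).
class Model:
    def forward(self, x, mask=None):
        return x, mask

raw_forward = Model.forward

def forward_with_default_mask(self, x, mask=None):
    if mask is None:
        mask = [1] * len(x)          # same idea as paddle.ones_like(input_ids)
    return raw_forward(self, x, mask=mask)

Model.forward = forward_with_default_mask
assert Model().forward([3, 4]) == ([3, 4], [1, 1])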
- if not from_hf_hub and not os.path.exists( - os.path.join(cache_dir, config_path, "config.json")): + if not from_hf_hub and not os.path.exists(os.path.join(cache_dir, config_path, "config.json")): config.save_pretrained(os.path.join(cache_dir, config_path)) if paddle_dtype is None: @@ -825,8 +810,7 @@ def from_pretrained_v3(cls, try: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=_add_variant( - TRANSFORMERS_SAFE_WEIGHTS_NAME, variant), + weights_name=_add_variant(TRANSFORMERS_SAFE_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -836,15 +820,15 @@ def from_pretrained_v3(cls, revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) except Exception: # noqa: E722 model_file = None pass if model_file is None: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME, - variant), + weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -854,7 +838,8 @@ def from_pretrained_v3(cls, revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) else: model_file = _get_model_file( pretrained_model_name_or_path, @@ -868,19 +853,20 @@ def from_pretrained_v3(cls, revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) assert model_file is not None # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) init_contexts = [] - dtype = set(v.dtype for v in state_dict.values() - if paddle.is_tensor(v) and paddle.is_floating_point(v)) + dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v)) if len(dtype) > 1 and paddle.float32 not in dtype: raise ValueError( f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please" - f" make sure that {model_file} weights have only one dtype.") + f" make sure that {model_file} weights have only one dtype." + ) elif len(dtype) > 1 and paddle.float32 in dtype: dtype = paddle.float32 elif len(dtype) == 0: @@ -900,22 +886,18 @@ def from_pretrained_v3(cls, model = cls(config, **model_kwargs) # convert weights - if (from_diffusers or is_torch_file(model_file)) and hasattr( - cls, "smart_convert"): + if (from_diffusers or is_torch_file(model_file)) and hasattr(cls, "smart_convert"): state_dict = cls.smart_convert(state_dict, model) loaded_state_dict_keys = list(state_dict.keys()) - ( - model, - missing_keys, - unexpected_keys, - mismatched_keys, ) = cls._load_pretrained_model_old( - model=model, - state_dict=state_dict, - loaded_keys=loaded_state_dict_keys, - ignore_mismatched_sizes=ignore_mismatched_sizes, - dtype=None, ) + (model, missing_keys, unexpected_keys, mismatched_keys,) = cls._load_pretrained_model_old( + model=model, + state_dict=state_dict, + loaded_keys=loaded_state_dict_keys, + ignore_mismatched_sizes=ignore_mismatched_sizes, + dtype=None, + ) loading_info = { "missing_keys": missing_keys, "unexpected_keys": unexpected_keys, @@ -941,9 +923,7 @@ def from_pretrained_v3(cls, " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." 
) else: - logger.info( - f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" - ) + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") if len(missing_keys) > 0: logger.warning( @@ -956,17 +936,21 @@ def from_pretrained_v3(cls, f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" f" was trained on, you can already use {model.__class__.__name__} for predictions without further" - " training.") + " training." + ) if len(mismatched_keys) > 0: - mismatched_warning = "\n".join([ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ]) + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" - " to use it for predictions and inference.") + " to use it for predictions and inference." + ) if output_loading_info: return model, loading_info @@ -979,12 +963,13 @@ def from_pretrained_v3(cls, @classmethod def _load_pretrained_model_old( - cls, - model: PretrainedModel, - state_dict: Dict[str, paddle.Tensor], - loaded_keys: List[str], - ignore_mismatched_sizes=False, - dtype=None, ) -> Tuple[List[str]]: + cls, + model: PretrainedModel, + state_dict: Dict[str, paddle.Tensor], + loaded_keys: List[str], + ignore_mismatched_sizes=False, + dtype=None, + ) -> Tuple[List[str]]: model_state_dict = model.state_dict() expected_keys = list(model_state_dict.keys()) @@ -992,8 +977,7 @@ def _load_pretrained_model_old( if len(prefix) > 0: has_prefix_module = any(s.startswith(prefix) for s in loaded_keys) - expects_prefix_module = any( - s.startswith(prefix) for s in expected_keys) + expects_prefix_module = any(s.startswith(prefix) for s in expected_keys) else: has_prefix_module = False expects_prefix_module = False @@ -1004,10 +988,7 @@ def _load_pretrained_model_old( add_prefix_to_model = has_prefix_module and not expects_prefix_module if remove_prefix_from_model: - expected_keys = [ - ".".join(s.split(".")[1:]) if s.startswith(prefix) else s - for s in expected_keys - ] + expected_keys = [".".join(s.split(".")[1:]) if s.startswith(prefix) else s for s in expected_keys] elif add_prefix_to_model: expected_keys = [".".join([prefix, s]) for s in expected_keys] @@ -1018,31 +999,26 @@ def _load_pretrained_model_old( # the user. 
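How the expected/loaded key comparison above behaves when only the model side carries the base-model prefix can be seen in a small standalone example; the prefix and key names here are made up.

# Simplified sketch (not the real implementation) of the prefix bookkeeping:
# the checkpoint has no base-model prefix, the model expects one, so the
# prefix is stripped from expected keys before diffing.
prefix = "bert"  # hypothetical base_model_prefix
loaded_keys = ["embeddings.weight", "encoder.0.weight"]
expected_keys = ["bert.embeddings.weight", "bert.encoder.0.weight", "cls.bias"]

has_prefix_module = any(k.startswith(prefix) for k in loaded_keys)        # False
expects_prefix_module = any(k.startswith(prefix) for k in expected_keys)  # True

if not has_prefix_module and expects_prefix_module:
    expected_keys = [".".join(k.split(".")[1:]) if k.startswith(prefix) else k
                     for k in expected_keys]

missing_keys = sorted(set(expected_keys) - set(loaded_keys))     # ['cls.bias']
unexpected_keys = sorted(set(loaded_keys) - set(expected_keys))  # []
assert missing_keys == ["cls.bias"] and unexpected_keys == []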
if cls._keys_to_ignore_on_load_missing is not None: for pat in cls._keys_to_ignore_on_load_missing: - missing_keys = [ - k for k in missing_keys if re.search(pat, k) is None - ] + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] if cls._keys_to_ignore_on_load_unexpected is not None: for pat in cls._keys_to_ignore_on_load_unexpected: - unexpected_keys = [ - k for k in unexpected_keys if re.search(pat, k) is None - ] + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] # Make sure we are able to load base models as well as derived models (with heads) start_prefix = "" model_to_load = model - if (len(cls.base_model_prefix) > 0 and - not hasattr(model, cls.base_model_prefix) and - has_prefix_module): + if len(cls.base_model_prefix) > 0 and not hasattr(model, cls.base_model_prefix) and has_prefix_module: start_prefix = cls.base_model_prefix + "." def _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - add_prefix_to_model, - remove_prefix_from_model, - ignore_mismatched_sizes, ): + state_dict, + model_state_dict, + loaded_keys, + add_prefix_to_model, + remove_prefix_from_model, + ignore_mismatched_sizes, + ): mismatched_keys = [] if ignore_mismatched_sizes: for checkpoint_key in loaded_keys: @@ -1054,13 +1030,17 @@ def _find_mismatched_keys( # The model key doesn't start with `prefix` but `checkpoint_key` does so we remove it. model_key = ".".join(checkpoint_key.split(".")[1:]) - if (model_key in model_state_dict and - state_dict[checkpoint_key].shape != - model_state_dict[model_key].shape): - mismatched_keys.append(( - checkpoint_key, - state_dict[checkpoint_key].shape, - model_state_dict[model_key].shape, )) + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + ( + checkpoint_key, + state_dict[checkpoint_key].shape, + model_state_dict[model_key].shape, + ) + ) del state_dict[checkpoint_key] return mismatched_keys @@ -1071,7 +1051,8 @@ def _find_mismatched_keys( loaded_keys, add_prefix_to_model, remove_prefix_from_model, - ignore_mismatched_sizes, ) + ignore_mismatched_sizes, + ) start_prefix = prefix + "." @@ -1090,8 +1071,7 @@ def _find_mismatched_keys( if add_prefix_to_model: for key in list(state_dict.keys()): if key.startswith(start_prefix): - state_dict[key.replace(start_prefix, "")] = state_dict.pop( - key) + state_dict[key.replace(start_prefix, "")] = state_dict.pop(key) if remove_prefix_from_model: for key in list(state_dict.keys()): @@ -1126,12 +1106,9 @@ def _find_mismatched_keys( # this is the temp hard code for fused-mt transformer if model.keep_in_fp32_modules(key, model.config, dtype): target_dtype = "float32" - state_dict[key] = paddle.cast( - state_dict[key], dtype=target_dtype) + state_dict[key] = paddle.cast(state_dict[key], dtype=target_dtype) else: - raise ValueError( - f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid" - ) + raise ValueError(f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid") else: dtype_prefix_len = len("paddle.") for k, v in model_to_load.state_dict().items(): @@ -1155,8 +1132,7 @@ def _find_mismatched_keys( # To avoid recursive import temporarily. 
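When ignore_mismatched_sizes is set, _find_mismatched_keys above only compares shapes and evicts the offending entries from the state dict; reduced to plain shape tuples the behaviour looks like this toy example.

# Toy version of the _find_mismatched_keys logic: shape tuples stand in for
# tensors, and mismatched entries are recorded and dropped.
model_state_dict = {"linear.weight": (768, 768), "linear.bias": (768,)}
state_dict = {"linear.weight": (512, 768), "linear.bias": (768,)}

mismatched_keys = []
for key in list(state_dict):
    if key in model_state_dict and state_dict[key] != model_state_dict[key]:
        mismatched_keys.append((key, state_dict[key], model_state_dict[key]))
        del state_dict[key]  # the mismatched weight is not loaded, only reported

assert mismatched_keys == [("linear.weight", (512, 768), (768, 768))]
assert list(state_dict) == ["linear.bias"]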
import paddlenlp.ops.fast_transformer.transformer.decoding as ft_decoding - state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model( - model_to_load, state_dict) + state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(model_to_load, state_dict) if paddle.in_dynamic_mode(): model_to_load.set_state_dict(state_to_load) @@ -1170,19 +1146,20 @@ def _find_mismatched_keys( @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path, - *args, - from_hf_hub=False, - subfolder=None, - paddle_dtype=None, - from_diffusers=None, - variant=None, - **kwargs, ): + cls, + pretrained_model_name_or_path, + *args, + from_hf_hub=False, + subfolder=None, + paddle_dtype=None, + from_diffusers=None, + variant=None, + **kwargs, + ): try: if cls.constructed_from_pretrained_config() and ( - hasattr(cls, "smart_convert") or - hasattr(cls, "register_load_torch_hook")): + hasattr(cls, "smart_convert") or hasattr(cls, "register_load_torch_hook") + ): return from_pretrained_v3( cls, pretrained_model_name_or_path, @@ -1192,7 +1169,8 @@ def from_pretrained( paddle_dtype=paddle_dtype, from_diffusers=from_diffusers, variant=variant, - **kwargs, ) + **kwargs, + ) except Exception: pass @@ -1206,7 +1184,8 @@ def from_pretrained( from_hf_hub=from_hf_hub, subfolder=subfolder, dtype=dtype, - **kwargs, ) + **kwargs, + ) PretrainedModel.from_pretrained = from_pretrained @@ -1214,51 +1193,43 @@ def from_pretrained( from safetensors.numpy import save_file as safetensors_numpy_save_file if is_torch_available(): - from safetensors.torch import \ - save_file as safetensors_torch_save_file + from safetensors.torch import save_file as safetensors_torch_save_file if is_torch_available(): import torch def save_pretrained_v3( - self: PretrainedModel, - save_directory: str, - is_main_process: bool=True, - save_function: Callable=None, - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: Optional[bool]=None, ): - from ..models.modeling_pytorch_paddle_utils import \ - convert_paddle_state_dict_to_pytorch + self: PretrainedModel, + save_directory: str, + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: Optional[bool] = None, + ): + from ..models.modeling_pytorch_paddle_utils import ( + convert_paddle_state_dict_to_pytorch, + ) from ..models.modeling_utils import convert_state_dict if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." 
- ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return - model_to_save = self._layers if isinstance( - self, paddle.DataParallel) else self + model_to_save = self._layers if isinstance(self, paddle.DataParallel) else self if is_main_process: try: - model_to_save.config.dtype = str(model_to_save._dtype).split( - ".")[-1] + model_to_save.config.dtype = str(model_to_save._dtype).split(".")[-1] except: model_to_save.config.dtype = "float32" # Attach architecture to the config - model_to_save.config.architectures = [ - model_to_save.__class__.__name__ - ] + model_to_save.config.architectures = [model_to_save.__class__.__name__] model_to_save.config.save_pretrained(save_directory) @@ -1273,12 +1244,10 @@ def save_pretrained_v3( if safe_serialization: if is_torch_available(): save_function = safetensors_torch_save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: save_function = safetensors_numpy_save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") + state_dict = convert_state_dict(state_dict, framework="numpy") weights_name = _add_variant("model.safetensors", variant) else: if not is_torch_available(): @@ -1287,11 +1256,9 @@ def save_pretrained_v3( ) save_function = torch.save weights_name = _add_variant("pytorch_model.bin", variant) - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") - state_dict = convert_paddle_state_dict_to_pytorch(state_dict, - model_to_save) + state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save) else: save_function = paddle.save weights_name = _add_variant("model_state.pdparams", variant) @@ -1299,24 +1266,22 @@ def save_pretrained_v3( # Save the model save_function(state_dict, os.path.join(save_directory, weights_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weights_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") def save_pretrained( - self, - save_dir: str, - is_main_process: bool=True, - state_dict=None, - save_function: Callable=None, - max_shard_size="10GB", - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: Optional[bool]=None, - *args, - **kwargs, ): - if self.constructed_from_pretrained_config() and hasattr( - self, "smart_convert"): + self, + save_dir: str, + is_main_process: bool = True, + state_dict=None, + save_function: Callable = None, + max_shard_size="10GB", + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: Optional[bool] = None, + *args, + **kwargs, + ): + if self.constructed_from_pretrained_config() and hasattr(self, "smart_convert"): return save_pretrained_v3( self, save_dir, @@ -1324,7 +1289,8 @@ def save_pretrained( save_function=save_function, safe_serialization=safe_serialization, variant=variant, - to_diffusers=to_diffusers, ) + to_diffusers=to_diffusers, + ) return raw_save_pretrained( self, save_dir=save_dir, @@ -1335,32 +1301,40 @@ def save_pretrained( safe_serialization=safe_serialization, variant=variant, *args, - **kwargs, ) + 
**kwargs, + ) PretrainedModel.save_pretrained = save_pretrained from paddlenlp.transformers import ( - BertModel, BitBackbone, ClapTextModelWithProjection, CLIPTextModel, - CLIPTextModelWithProjection, CLIPVisionModel, - CLIPVisionModelWithProjection, DPTForDepthEstimation, SpeechT5HifiGan, - T5EncoderModel) + BertModel, + BitBackbone, + ClapTextModelWithProjection, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModel, + CLIPVisionModelWithProjection, + DPTForDepthEstimation, + SpeechT5HifiGan, + T5EncoderModel, + ) if not hasattr(T5EncoderModel, "_keep_in_fp32_modules"): T5EncoderModel._keep_in_fp32_modules = ["wo"] - from ..models.modeling_pytorch_paddle_utils import \ - convert_pytorch_state_dict_to_paddle_class_method - from ..pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ..models.modeling_pytorch_paddle_utils import ( + convert_pytorch_state_dict_to_paddle_class_method, + ) + from ..pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) from ..pipelines.deepfloyd_if.safety_checker import IFSafetyChecker - from ..pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel - from ..pipelines.paint_by_example.image_encoder import \ - PaintByExampleImageEncoder - from ..pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker - from ..pipelines.stable_diffusion_safe.safety_checker import \ - SafeStableDiffusionSafetyChecker + from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel + from ..pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder + from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + from ..pipelines.stable_diffusion_safe.safety_checker import ( + SafeStableDiffusionSafetyChecker, + ) @classmethod def clip_smart_convert(cls, state_dict, pd_model): @@ -1380,7 +1354,9 @@ def clip_smart_convert(cls, state_dict, pd_model): ".pre_layrnorm.": ".ln_pre.", ".post_layernorm.": ".ln_post.", } - ignore_value = ["position_ids", ] + ignore_value = [ + "position_ids", + ] if cls in [PaintByExampleImageEncoder]: # ignore mapper. prefix, we will use convert_pytorch_state_dict_to_paddle to convert mapper.xxxx state_dict ignore_value.append("mapper.") @@ -1410,11 +1386,11 @@ def clip_smart_convert(cls, state_dict, pd_model): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale" and value.ndim == 1: - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and cls in [ - StableDiffusionSafetyChecker, - SafeStableDiffusionSafetyChecker, + StableDiffusionSafetyChecker, + SafeStableDiffusionSafetyChecker, ]: name = "clip." 
+ name new_model_state[name] = value @@ -1423,8 +1399,7 @@ def clip_smart_convert(cls, state_dict, pd_model): if cls in [PaintByExampleImageEncoder]: # convert mapper - mappersd = cls.smart_convert( - state_dict, pd_model, sub_layer="mapper.") + mappersd = cls.smart_convert(state_dict, pd_model, sub_layer="mapper.") new_model_state.update(mappersd) return new_model_state @@ -1451,10 +1426,8 @@ def bert_smart_convert(cls, state_dict, pd_model): # about cls predictions ignore "cls.predictions.transform.dense": "cls.predictions.transform", "cls.predictions.decoder.weight": "cls.predictions.decoder_weight", - "cls.predictions.transform.LayerNorm.weight": - "cls.predictions.layer_norm.weight", - "cls.predictions.transform.LayerNorm.bias": - "cls.predictions.layer_norm.bias", + "cls.predictions.transform.LayerNorm.weight": "cls.predictions.layer_norm.weight", + "cls.predictions.transform.LayerNorm.bias": "cls.predictions.layer_norm.bias", "cls.predictions.bias": "cls.predictions.decoder_bias", } ignore_value = ["position_ids"] @@ -1481,8 +1454,7 @@ def bert_smart_convert(cls, state_dict, pd_model): def ldmbert_smart_convert(cls, state_dict, pd_model): transformers2ppnlp = { "model.embed_tokens.weight": "embeddings.word_embeddings.weight", - "model.embed_positions.weight": - "embeddings.position_embeddings.weight", + "model.embed_positions.weight": "embeddings.position_embeddings.weight", "model.layer_norm.": "final_layer_norm.", "model.layers": "encoder.layers", ".self_attn_layer_norm.": ".norm1.", @@ -1513,14 +1485,14 @@ def ldmbert_smart_convert(cls, state_dict, pd_model): LDMBertModel.smart_convert = ldmbert_smart_convert for cls_ in [ - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPVisionModel, - CLIPVisionModelWithProjection, - StableDiffusionSafetyChecker, - SafeStableDiffusionSafetyChecker, - PaintByExampleImageEncoder, - IFSafetyChecker, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModel, + CLIPVisionModelWithProjection, + StableDiffusionSafetyChecker, + SafeStableDiffusionSafetyChecker, + PaintByExampleImageEncoder, + IFSafetyChecker, ]: setattr(cls_, "smart_convert", clip_smart_convert) @@ -1532,8 +1504,12 @@ def ldmbert_smart_convert(cls, state_dict, pd_model): else: # NEW TRANSFORMERS CLIP MODEL from ..pipelines.stable_diffusion.hf_clip_model import ( - HFCLIPModel, HFCLIPTextModel, HFCLIPTextModelWithProjection, - HFCLIPVisionModel, HFCLIPVisionModelWithProjection) + HFCLIPModel, + HFCLIPTextModel, + HFCLIPTextModelWithProjection, + HFCLIPVisionModel, + HFCLIPVisionModelWithProjection, + ) TRANSFORMERS_CLIP_MODEL = [ HFCLIPModel, @@ -1543,29 +1519,27 @@ def ldmbert_smart_convert(cls, state_dict, pd_model): HFCLIPVisionModelWithProjection, ] for cls_ in [ - DPTForDepthEstimation, - BitBackbone, - SpeechT5HifiGan, - ClapTextModelWithProjection, - T5EncoderModel, + DPTForDepthEstimation, + BitBackbone, + SpeechT5HifiGan, + ClapTextModelWithProjection, + T5EncoderModel, ] + TRANSFORMERS_CLIP_MODEL: - setattr(cls_, "smart_convert", - convert_pytorch_state_dict_to_paddle_class_method) + setattr(cls_, "smart_convert", convert_pytorch_state_dict_to_paddle_class_method) # TODO remove this when we updage ImageProcessingMixin # patch get_image_processor_dict support subfolder. 
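The various smart_convert helpers above all follow the same recipe: rename checkpoint keys with a substring map and transpose 2-D Linear weights. A toy version of that idea; the mapping entries and key names are illustrative, not the full tables used above.

# Toy illustration of the smart_convert recipe (illustrative mapping only).
import numpy as np

torch2paddle = {".encoder.layer.": ".encoder.layers.", ".LayerNorm.": ".norm."}

def rename(name):
    for old, new in torch2paddle.items():
        name = name.replace(old, new)
    return name

state_dict = {
    "bert.encoder.layer.0.output.dense.weight": np.zeros((8, 4)),
    "bert.encoder.layer.0.output.LayerNorm.weight": np.ones((4,)),
}
converted = {}
for name, value in state_dict.items():
    if value.ndim == 2 and "LayerNorm" not in name and "embeddings" not in name:
        value = value.T  # torch Linear stores (out, in); paddle expects (in, out)
    converted[rename(name)] = value

assert converted["bert.encoder.layers.0.output.dense.weight"].shape == (4, 8)
assert "bert.encoder.layers.0.output.norm.weight" in converted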
IMAGE_PROCESSOR_NAME = "preprocessor_config.json" - from paddlenlp.transformers.feature_extraction_utils import \ - FeatureExtractionMixin - from paddlenlp.transformers.image_processing_utils import \ - ImageProcessingMixin + from paddlenlp.transformers.feature_extraction_utils import FeatureExtractionMixin + from paddlenlp.transformers.image_processing_utils import ImageProcessingMixin @classmethod def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs): from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) @@ -1589,12 +1563,11 @@ def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs): revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) try: # Load image_processor dict - with open( - resolved_image_processor_file, "r", - encoding="utf-8") as reader: + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: text = reader.read() image_processor_dict = json.loads(text) diff --git a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py index 367da2b281b53..7000346e862f7 100644 --- a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py +++ b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py @@ -32,41 +32,36 @@ def scatter_reduce( - input: paddle.Tensor, - dim: int, - index: paddle.Tensor, - src: paddle.Tensor, - reduce: str="mean", - include_self: bool=True, ) -> paddle.Tensor: + input: paddle.Tensor, + dim: int, + index: paddle.Tensor, + src: paddle.Tensor, + reduce: str = "mean", + include_self: bool = True, +) -> paddle.Tensor: # reduce "sum", "prod", "mean", # TODO support "amax", "amin" and include_self = False if reduce in ["sum", "assign", "add"]: if reduce == "sum": reduce = "add" - input.put_along_axis_( - indices=index, values=src, axis=dim, reduce=reduce) + input.put_along_axis_(indices=index, values=src, axis=dim, reduce=reduce) elif reduce == "mean": # compute sum first input.put_along_axis_(indices=index, values=src, axis=dim, reduce="add") # compute div secondly input_div = paddle.ones_like(input).put_along_axis( indices=index, - values=paddle.to_tensor( - 1.0, dtype=input.dtype), + values=paddle.to_tensor(1.0, dtype=input.dtype), axis=dim, - reduce="add", ) + reduce="add", + ) input = input / input_div elif reduce in ["prod", "mul", "multiply"]: - input = paddle.put_along_axis( - input.cpu(), - indices=index.cpu(), - values=src.cpu(), - axis=dim, - reduce="mul")._to(device=paddle.get_device()) - else: - raise NotImplementedError( - "only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!" 
+ input = paddle.put_along_axis(input.cpu(), indices=index.cpu(), values=src.cpu(), axis=dim, reduce="mul")._to( + device=paddle.get_device() ) + else: + raise NotImplementedError("only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!") return input @@ -75,18 +70,19 @@ def scatter_reduce( paddle.Tensor.scatter_reduce = scatter_reduce -def do_nothing(x: paddle.Tensor, mode: str=None): +def do_nothing(x: paddle.Tensor, mode: str = None): return x def bipartite_soft_matching_random2d( - metric: paddle.Tensor, - w: int, - h: int, - sx: int, - sy: int, - r: int, - no_rand: bool=False, ) -> Tuple[Callable, Callable]: + metric: paddle.Tensor, + w: int, + h: int, + sx: int, + sy: int, + r: int, + no_rand: bool = False, +) -> Tuple[Callable, Callable]: """ Partitions the tokens into src and dst and merges r tokens from src to dst. Dst tokens are partitioned by choosing one randomy in each (sx, sy) region. @@ -112,24 +108,23 @@ def bipartite_soft_matching_random2d( if no_rand: rand_idx = paddle.zeros((hsy, wsx, 1), dtype=paddle.int64) else: - rand_idx = paddle.randint( - sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64) + rand_idx = paddle.randint(sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64) # The image might not divide sx and sy, so we need to work on a view of the top left if the idx buffer instead idx_buffer_view = paddle.zeros([hsy, wsx, sy * sx], dtype=paddle.int64) idx_buffer_view.put_along_axis_( axis=2, indices=rand_idx, - values=-paddle.ones_like( - rand_idx, dtype=rand_idx.dtype), ) - idx_buffer_view = (idx_buffer_view.reshape([hsy, wsx, sy, sx]) - .transpose([0, 2, 1, 3]) - .reshape([hsy * sy, wsx * sx])) + values=-paddle.ones_like(rand_idx, dtype=rand_idx.dtype), + ) + idx_buffer_view = ( + idx_buffer_view.reshape([hsy, wsx, sy, sx]).transpose([0, 2, 1, 3]).reshape([hsy * sy, wsx * sx]) + ) # Image is not divisible by sx or sy so we need to move it into a new buffer if (hsy * sy) < h or (wsx * sx) < w: idx_buffer = paddle.zeros([h, w], dtype=paddle.int64) - idx_buffer[:(hsy * sy), :(wsx * sx)] = idx_buffer_view + idx_buffer[: (hsy * sy), : (wsx * sx)] = idx_buffer_view else: idx_buffer = idx_buffer_view @@ -147,10 +142,8 @@ def bipartite_soft_matching_random2d( def split(x): C = x.shape[-1] - src = x.take_along_axis( - indices=a_idx.expand([B, N - num_dst, C]), axis=1) - dst = x.take_along_axis( - indices=b_idx.expand([B, num_dst, C]), axis=1) + src = x.take_along_axis(indices=a_idx.expand([B, N - num_dst, C]), axis=1) + dst = x.take_along_axis(indices=b_idx.expand([B, num_dst, C]), axis=1) return src, dst # Cosine similarity between A and B @@ -178,12 +171,10 @@ def merge(x: paddle.Tensor, mode="mean") -> paddle.Tensor: src, dst = split(x) n, t1, c = src.shape - unm = src.take_along_axis( - indices=unm_idx.expand([n, t1 - r, c]), axis=-2) + unm = src.take_along_axis(indices=unm_idx.expand([n, t1 - r, c]), axis=-2) src = src.take_along_axis(indices=src_idx.expand([n, r, c]), axis=-2) - dst = scatter_reduce( - dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode) + dst = scatter_reduce(dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode) return paddle.concat([unm, dst], axis=1) @@ -200,25 +191,27 @@ def unmerge(x: paddle.Tensor) -> paddle.Tensor: out.put_along_axis_( indices=b_idx.expand([B, num_dst, c]), values=dst, - axis=-2, ) + axis=-2, + ) out.put_along_axis_( - indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis( - indices=unm_idx, axis=1).expand([B, unm_len, c]), + indices=a_idx.expand([B, a_idx.shape[1], 1]) + 
.take_along_axis(indices=unm_idx, axis=1) + .expand([B, unm_len, c]), values=unm, - axis=-2, ) + axis=-2, + ) out.put_along_axis_( - indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis( - indices=src_idx, axis=1).expand([B, r, c]), + indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis(indices=src_idx, axis=1).expand([B, r, c]), values=src, - axis=-2, ) + axis=-2, + ) return out return merge, unmerge -def compute_merge(x: paddle.Tensor, - tome_info: Dict[str, Any]) -> Tuple[Callable, ...]: +def compute_merge(x: paddle.Tensor, tome_info: Dict[str, Any]) -> Tuple[Callable, ...]: original_h, original_w = tome_info["size"] original_tokens = original_h * original_w downsample = int(math.ceil(math.sqrt(original_tokens // x.shape[1]))) @@ -232,8 +225,7 @@ def compute_merge(x: paddle.Tensor, # If the batch size is odd, then it's not possible for promted and unprompted images to be in the same # batch, which causes artifacts with use_rand, so force it to be off. use_rand = False if x.shape[0] % 2 == 1 else args["use_rand"] - m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"], - r, not use_rand) + m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"], r, not use_rand) else: m, u = (do_nothing, do_nothing) @@ -255,31 +247,27 @@ class ToMeBasicTransformerBlock(block_class): _parent = block_class def forward( - self: BasicTransformerBlock, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, ) -> paddle.Tensor: + self: BasicTransformerBlock, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ) -> paddle.Tensor: # (1) ToMe - m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states, - self._tome_info) + m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states, self._tome_info) if self.use_ada_layer_norm: norm_hidden_states = self.norm1(hidden_states, timestep) elif self.use_ada_layer_norm_zero: - ( - norm_hidden_states, - gate_msa, - shift_mlp, - scale_mlp, - gate_mlp, ) = self.norm1( - hidden_states, - timestep, - class_labels, - hidden_dtype=hidden_states.dtype, ) + (norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp,) = self.norm1( + hidden_states, + timestep, + class_labels, + hidden_dtype=hidden_states.dtype, + ) else: norm_hidden_states = self.norm1(hidden_states) @@ -287,15 +275,13 @@ def forward( norm_hidden_states = m_a(norm_hidden_states) # 1. 
Self-Attention - cross_attention_kwargs = (cross_attention_kwargs - if cross_attention_kwargs is not None else - {}) + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} attn_output = self.attn1( norm_hidden_states, - encoder_hidden_states=encoder_hidden_states - if self.only_cross_attention else None, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output @@ -303,9 +289,9 @@ def forward( hidden_states = u_a(attn_output) + hidden_states if self.attn2 is not None: - norm_hidden_states = (self.norm2(hidden_states, timestep) - if self.use_ada_layer_norm else - self.norm2(hidden_states)) + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) # (4) ToMe m_c norm_hidden_states = m_c(norm_hidden_states) @@ -314,7 +300,8 @@ def forward( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=encoder_attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) # (5) ToMe u_c hidden_states = u_c(attn_output) + hidden_states @@ -322,9 +309,7 @@ def forward( norm_hidden_states = self.norm3(hidden_states) if self.use_ada_layer_norm_zero: - norm_hidden_states = ( - norm_hidden_states * - (1 + scale_mlp[:, None]) + shift_mlp[:, None]) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] # (6) ToMe m_m norm_hidden_states = m_m(norm_hidden_states) @@ -353,8 +338,7 @@ def hook(module, args): @patch_to([DiffusionPipeline, nn.Layer]) -def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], - only_return_self: bool=True): +def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], only_return_self: bool = True): """Removes a patch from a ToMeXXX module if it was already patched.""" model_list = [] if isinstance(model_or_pipe, DiffusionPipeline): @@ -385,15 +369,16 @@ def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], @patch_to([DiffusionPipeline, nn.Layer]) def apply_tome( - model_or_pipe: Union[nn.Layer, DiffusionPipeline], - ratio: float=0.5, - max_downsample: int=1, - sx: int=2, - sy: int=2, - use_rand: bool=True, - merge_attn: bool=True, - merge_crossattn: bool=False, - merge_mlp: bool=False, ): + model_or_pipe: Union[nn.Layer, DiffusionPipeline], + ratio: float = 0.5, + max_downsample: int = 1, + sx: int = 2, + sy: int = 2, + use_rand: bool = True, + merge_attn: bool = True, + merge_crossattn: bool = False, + merge_mlp: bool = False, +): """ Patches a stable diffusion model_or_pipe with ToMe. Apply this to the highest level stable diffusion object (i.e., it should have a .unet). 
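Since apply_tome and remove_tome are attached to both DiffusionPipeline and nn.Layer via patch_to, they are expected to be callable directly on a loaded pipeline. A hedged usage sketch, not part of this patch; the checkpoint name is only an example and any StableDiffusion-style pipeline with a .unet should do.

# Usage sketch: enable ToMe, run inference, then restore the original blocks.
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.apply_tome(ratio=0.5)   # merge roughly half of the self-attention tokens
image = pipe("an astronaut riding a horse").images[0]
pipe.remove_tome()           # undo the ToMeBasicTransformerBlock patch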
diff --git a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py index 6cc870cfb75ee..89a574fe97842 100644 --- a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py +++ b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py @@ -42,7 +42,7 @@ def convert_pt_to_pd(state, dtype): if val.ndim == 2: val = val.T if val.ndim == 0: - val = val.reshape((1, )) + val = val.reshape((1,)) new_state[b] = val.cast(dtype) else: print(f"We find {a} not in state_dict and we will continue!") @@ -87,12 +87,10 @@ def save_lora(pipe_or_module, save_directory, WEIGHT_NAME=None): if is_torch_available(): save_function = safetensors.torch.save_file - outdict = convert_state_dict( - convert_pd_to_pt(outdict), framework="torch") + outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="torch") else: save_function = safetensors.numpy.save_file - outdict = convert_state_dict( - convert_pd_to_pt(outdict), framework="numpy") + outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="numpy") save_function(outdict, os.path.join(save_directory, WEIGHT_NAME)) del outdict @@ -116,15 +114,16 @@ def set_lora(self): @patch_to([DiffusionPipeline, nn.Layer]) def apply_lora( - pipe_or_module, - lora_weight_or_path=None, - rank=4, - alpha=None, - multiplier=1.0, - text_encoder_target_replace_modules=["TransformerEncoderLayer"], - unet_target_replace_modules=["Transformer2DModel", "Attention"], - enable_lora=True, - **kwargs, ): + pipe_or_module, + lora_weight_or_path=None, + rank=4, + alpha=None, + multiplier=1.0, + text_encoder_target_replace_modules=["TransformerEncoderLayer"], + unet_target_replace_modules=["Transformer2DModel", "Attention"], + enable_lora=True, + **kwargs, +): resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) paddle_dtype = kwargs.pop("paddle_dtype", None) @@ -143,17 +142,16 @@ def apply_lora( lora_weight_or_path = str(lora_weight_or_path) if os.path.isfile(lora_weight_or_path): lora_weight_or_path = lora_weight_or_path - elif lora_weight_or_path.startswith( - "http://") or lora_weight_or_path.startswith("https://"): + elif lora_weight_or_path.startswith("http://") or lora_weight_or_path.startswith("https://"): lora_weight_or_path = ppdiffusers_url_download( lora_weight_or_path, cache_dir=cache_dir, resume_download=resume_download, - force_download=force_download, ) + force_download=force_download, + ) else: raise EnvironmentError(f"Please check your {lora_weight_or_path}.") - lora_weight_or_path = convert_pt_to_pd( - smart_load(lora_weight_or_path), paddle_dtype) + lora_weight_or_path = convert_pt_to_pd(smart_load(lora_weight_or_path), paddle_dtype) mayberanklist = [] maybealphalist = [] @@ -176,67 +174,64 @@ def apply_lora( if len(mayberanklist) > 20: break if len(set(mayberanklist)) > 1: - print( - f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}." - ) + print(f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}.") else: rank = mayberanklist[0] print(f"|---------------Currently, rank is {rank}!") if len(set(maybealphalist)) > 1: - print( - f"Can't guess alpha! Here are the rank list {maybealphalist}. We will use default alpha {alpha}" - ) + print(f"Can't guess alpha! Here are the rank list {maybealphalist}. 
We will use default alpha {alpha}") else: alpha = maybealphalist[0] print(f"|---------------Currently, alpha is {alpha}!") waitlist = [] if isinstance(pipe_or_module, nn.Layer): - waitlist.append(( - pipe_or_module, - text_encoder_target_replace_modules + unet_target_replace_modules, - )) + waitlist.append( + ( + pipe_or_module, + text_encoder_target_replace_modules + unet_target_replace_modules, + ) + ) else: if hasattr(pipe_or_module, "text_encoder"): - waitlist.append((pipe_or_module.text_encoder, - text_encoder_target_replace_modules)) + waitlist.append((pipe_or_module.text_encoder, text_encoder_target_replace_modules)) if hasattr(pipe_or_module, "unet"): waitlist.append((pipe_or_module.unet, unet_target_replace_modules)) lora_modules = {} for each_module, target_replace_modules in waitlist: for name1, module in each_module.named_sublayers(include_self=True): if module.__class__.__name__ in target_replace_modules: - for name2, child_module in module.named_sublayers( - include_self=True): + for name2, child_module in module.named_sublayers(include_self=True): if not getattr(child_module, "is_lora_linear", False) and ( - child_module.__class__.__name__ == "Linear" or - (child_module.__class__.__name__ == "Conv2D" and - list(child_module._kernel_size) == [1, 1])): + child_module.__class__.__name__ == "Linear" + or (child_module.__class__.__name__ == "Conv2D" and list(child_module._kernel_size) == [1, 1]) + ): # if we apply lora multi - if hasattr(child_module, - "merged") and child_module.merged: + if hasattr(child_module, "merged") and child_module.merged: with paddle.no_grad(): if child_module.is_conv: new_weight = ( - child_module.weight.squeeze([-1, -2]) - - child_module.lora_up.weight.squeeze( - [-1, -2]) - @child_module.lora_down.weight.squeeze( - [-1, -2]) * child_module.multiplier - * child_module.scale).unsqueeze( - [-1, -2]) + child_module.weight.squeeze([-1, -2]) + - child_module.lora_up.weight.squeeze([-1, -2]) + @ child_module.lora_down.weight.squeeze([-1, -2]) + * child_module.multiplier + * child_module.scale + ).unsqueeze([-1, -2]) else: - new_weight = (child_module.weight - - child_module.lora_down.weight - @child_module.lora_up.weight * - child_module.multiplier * - child_module.scale) + new_weight = ( + child_module.weight + - child_module.lora_down.weight + @ child_module.lora_up.weight + * child_module.multiplier + * child_module.scale + ) child_module.weight.set_value(new_weight) in_features, out_features = ( child_module.weight.shape[0], - child_module.weight.shape[1], ) + child_module.weight.shape[1], + ) child_module.is_conv = False child_module.merged = False @@ -250,15 +245,11 @@ def apply_lora( ) if child_module.is_conv: - child_module.lora_down = nn.Conv2D( - in_features, rank, [1, 1], bias_attr=False) - child_module.lora_up = nn.Conv2D( - rank, out_features, [1, 1], bias_attr=False) + child_module.lora_down = nn.Conv2D(in_features, rank, [1, 1], bias_attr=False) + child_module.lora_up = nn.Conv2D(rank, out_features, [1, 1], bias_attr=False) else: - child_module.lora_down = nn.Linear( - in_features, rank, bias_attr=False) - child_module.lora_up = nn.Linear( - rank, out_features, bias_attr=False) + child_module.lora_down = nn.Linear(in_features, rank, bias_attr=False) + child_module.lora_up = nn.Linear(rank, out_features, bias_attr=False) child_module.lora_down.is_lora_linear = True child_module.lora_up.is_lora_linear = True child_module.rank = rank @@ -268,13 +259,10 @@ def apply_lora( alpha = alpha.detach().cast("float32").numpy() alpha = rank if alpha is 
None or alpha == 0 else alpha child_module.scale = alpha / child_module.rank - child_module.register_buffer( - "alpha", paddle.to_tensor( - alpha, dtype="float32")) + child_module.register_buffer("alpha", paddle.to_tensor(alpha, dtype="float32")) # same as microsoft's - kaiming_uniform_( - child_module.lora_down.weight, a=math.sqrt(5)) + kaiming_uniform_(child_module.lora_down.weight, a=math.sqrt(5)) zeros_(child_module.lora_up.weight) child_module.multiplier = multiplier @@ -287,44 +275,47 @@ def forward_lora(self, x): with paddle.no_grad(): if self.is_conv: new_weight = ( - self.weight.squeeze([-1, -2]) - - self.lora_up.weight.squeeze( - [-1, -2]) - @self.lora_down.weight.squeeze( - [-1, -2]) * self.multiplier - * self.scale).unsqueeze( - [-1, -2]) + self.weight.squeeze([-1, -2]) + - self.lora_up.weight.squeeze([-1, -2]) + @ self.lora_down.weight.squeeze([-1, -2]) + * self.multiplier + * self.scale + ).unsqueeze([-1, -2]) else: new_weight = ( - self.weight - - self.lora_down.weight - @self.lora_up.weight * - self.multiplier * self.scale) + self.weight + - self.lora_down.weight + @ self.lora_up.weight + * self.multiplier + * self.scale + ) self.weight.set_value(new_weight) self.merged = False if not self.enable_lora: return self.raw_forward(x) - return (self.raw_forward(x) + - self.lora_up(self.lora_down(x)) * - self.multiplier * self.scale) + return ( + self.raw_forward(x) + + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale + ) else: if self.enable_lora and not self.merged: with paddle.no_grad(): if self.is_conv: new_weight = ( - self.weight.squeeze([-1, -2]) + - self.lora_up.weight.squeeze( - [-1, -2]) - @self.lora_down.weight.squeeze( - [-1, -2]) * self.multiplier - * self.scale).unsqueeze( - [-1, -2]) + self.weight.squeeze([-1, -2]) + + self.lora_up.weight.squeeze([-1, -2]) + @ self.lora_down.weight.squeeze([-1, -2]) + * self.multiplier + * self.scale + ).unsqueeze([-1, -2]) else: new_weight = ( - self.weight + - self.lora_down.weight - @self.lora_up.weight * - self.multiplier * self.scale) + self.weight + + self.lora_down.weight + @ self.lora_up.weight + * self.multiplier + * self.scale + ) self.weight.set_value(new_weight) self.merged = True @@ -332,25 +323,25 @@ def forward_lora(self, x): with paddle.no_grad(): if self.is_conv: new_weight = ( - self.weight.squeeze([-1, -2]) - - self.lora_up.weight.squeeze( - [-1, -2]) - @self.lora_down.weight.squeeze( - [-1, -2]) * self.multiplier - * self.scale).unsqueeze( - [-1, -2]) + self.weight.squeeze([-1, -2]) + - self.lora_up.weight.squeeze([-1, -2]) + @ self.lora_down.weight.squeeze([-1, -2]) + * self.multiplier + * self.scale + ).unsqueeze([-1, -2]) else: new_weight = ( - self.weight - - self.lora_down.weight - @self.lora_up.weight * - self.multiplier * self.scale) + self.weight + - self.lora_down.weight + @ self.lora_up.weight + * self.multiplier + * self.scale + ) self.weight.set_value(new_weight) self.merged = False return self.raw_forward(x) - child_module.forward = MethodType(forward_lora, - child_module) + child_module.forward = MethodType(forward_lora, child_module) child_module.lora_down.training = child_module.training child_module.lora_up.training = child_module.training child_module.to(dtype=paddle_dtype) diff --git a/ppdiffusers/ppdiffusers/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipeline_utils.py index 4ddfca40ac392..48a455def8412 100644 --- a/ppdiffusers/ppdiffusers/pipeline_utils.py +++ b/ppdiffusers/ppdiffusers/pipeline_utils.py @@ -18,4 +18,4 @@ # It only exists so that temporarely `from 
diffusers.pipelines import DiffusionPipeline` works from .pipelines import ImagePipelineOutput # noqa: F401 -from .pipelines import DiffusionPipeline, TextPipelineOutput +from .pipelines import DiffusionPipeline, TextPipelineOutput # noqa: F401 diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py index 3c7b73e5fcf47..db10dd5dccfe7 100644 --- a/ppdiffusers/ppdiffusers/pipelines/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/__init__.py @@ -13,10 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import (OptionalDependencyNotAvailable, is_einops_available, - is_fastdeploy_available, is_k_diffusion_available, - is_librosa_available, is_note_seq_available, - is_paddle_available, is_paddlenlp_available) +from ..utils import ( + OptionalDependencyNotAvailable, + is_einops_available, + is_fastdeploy_available, + is_k_diffusion_available, + is_librosa_available, + is_note_seq_available, + is_paddle_available, + is_paddlenlp_available, +) try: if not is_paddle_available(): @@ -30,8 +36,12 @@ from .dit import DiTPipeline from .latent_diffusion import LDMSuperResolutionPipeline from .latent_diffusion_uncond import LDMPipeline - from .pipeline_utils import (AudioPipelineOutput, DiffusionPipeline, - ImagePipelineOutput, TextPipelineOutput) + from .pipeline_utils import ( + AudioPipelineOutput, + DiffusionPipeline, + ImagePipelineOutput, + TextPipelineOutput, + ) from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline @@ -51,38 +61,52 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 else: - from .alt_diffusion import (AltDiffusionImg2ImgPipeline, - AltDiffusionPipeline) + from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline from .audioldm import AudioLDMPipeline from .deepfloyd_if import ( - IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, IFPipeline, - IFSuperResolutionPipeline) + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .lvdm import LVDMTextToVideoPipeline, LVDMUncondPipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline from .stable_diffusion import ( - CycleDiffusionPipeline, StableDiffusionAdapterPipeline, + CycleDiffusionPipeline, + StableDiffusionAdapterPipeline, StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, - StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, + StableDiffusionControlNetPipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, StableDiffusionMegaPipeline, - StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, - StableDiffusionPipeline, StableDiffusionPipelineAllinOne, - StableDiffusionPix2PixZeroPipeline, StableDiffusionSAGPipeline, - StableDiffusionUpscalePipeline, StableUnCLIPImg2ImgPipeline, - 
StableUnCLIPPipeline) + StableDiffusionLatentUpscalePipeline, + StableDiffusionMegaPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionPipeline, + StableDiffusionPipelineAllinOne, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .text_to_video_synthesis import (TextToVideoSDPipeline, - TextToVideoZeroPipeline) + from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .versatile_diffusion import (VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline) + from .versatile_diffusion import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) from .vq_diffusion import VQDiffusionPipeline try: @@ -91,12 +115,13 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_fastdeploy_objects import * # noqa F403 else: - from .fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) + from .fastdeploy_utils import ( + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, + ) try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_fastdeploy_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 @@ -110,11 +135,11 @@ FastDeployStableDiffusionInpaintPipelineLegacy, FastDeployStableDiffusionMegaPipeline, FastDeployStableDiffusionPipeline, - FastDeployStableDiffusionUpscalePipeline) + FastDeployStableDiffusionUpscalePipeline, + ) try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_k_diffusion_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403 @@ -122,8 +147,7 @@ from .stable_diffusion import StableDiffusionKDiffusionPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_einops_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403 @@ -131,11 +155,9 @@ from .unidiffuser import UniDiffuserPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_note_seq_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403 else: - from .spectrogram_diffusion import (MidiProcessor, - SpectrogramDiffusionPipeline) + from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py 
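The restructured imports in pipelines/__init__.py keep the existing guard pattern: probe for each optional backend, raise OptionalDependencyNotAvailable when one is missing, and fall back to dummy placeholder objects instead of the real pipelines. A generic sketch of that pattern (hypothetical helper and package names, not the ppdiffusers internals):

import importlib.util


class OptionalDependencyNotAvailable(ImportError):
    """Raised when an optional backend needed by a pipeline is missing."""


def is_available(package: str) -> bool:
    # True if `package` can be imported in the current environment
    return importlib.util.find_spec(package) is not None


try:
    if not (is_available("paddle") and is_available("einops")):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    print("backend missing: dummy objects would be exported here")
else:
    print("backends found: the real pipeline classes would be imported here")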
b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py index 087da16f84c37..70cd40778b488 100644 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py +++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py @@ -24,9 +24,7 @@ from paddlenlp.transformers.model_outputs import ModelOutput -def create_position_ids_from_input_ids(input_ids, - padding_idx, - past_key_values_length=0): +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. @@ -38,8 +36,7 @@ def create_position_ids_from_input_ids(input_ids, """ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. mask = (input_ids != padding_idx).cast("int64") - incremental_indices = (paddle.cumsum( - mask, axis=1) + past_key_values_length) * mask + incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask return incremental_indices + padding_idx @@ -76,21 +73,23 @@ class RobertaSeriesConfig(XLMRobertaConfig): model_type = "roberta" def __init__( - self, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - project_dim=512, - pooler_fn="cls", - learn_encoder=False, - use_attention_mask=True, - **kwargs, ): + self, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + project_dim=512, + pooler_fn="cls", + learn_encoder=False, + use_attention_mask=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, - **kwargs, ) + **kwargs, + ) self.project_dim = project_dim self.pooler_fn = pooler_fn self.learn_encoder = learn_encoder @@ -99,9 +98,7 @@ def __init__( class RobertaSeriesModelWithTransformation(RobertaPretrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"] - _keys_to_ignore_on_load_missing = [ - r"position_ids", r"predictions.decoder.bias" - ] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] base_model_prefix = "roberta" config_class = RobertaSeriesConfig @@ -111,39 +108,35 @@ def __init__(self, config: RobertaSeriesConfig): # must reset _padding_idx self.roberta.embeddings.word_embeddings._padding_idx = None self.transformation = nn.Linear(config.hidden_size, config.project_dim) - self.has_pre_transformation = getattr(config, "has_pre_transformation", - False) + self.has_pre_transformation = getattr(config, "has_pre_transformation", False) if self.has_pre_transformation: - self.transformation_pre = nn.Linear(config.hidden_size, - config.project_dim) - self.pre_LN = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) + self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim) + self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.init_weights() def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - token_type_ids: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - return_dict: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, ): - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: 
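create_position_ids_from_input_ids above numbers only the non-padding tokens, starting at padding_idx + 1, by cumulative-summing a padding mask. A small NumPy sketch of the same trick (padding_idx = 1 assumed, matching the Roberta config in this file; the ids themselves are made up):

import numpy as np

padding_idx = 1
input_ids = np.array([[0, 11, 12, 13, 1, 1]])          # trailing 1s are padding

mask = (input_ids != padding_idx).astype(np.int64)      # [[1, 1, 1, 1, 0, 0]]
incremental = np.cumsum(mask, axis=1) * mask            # [[1, 2, 3, 4, 0, 0]]
position_ids = incremental + padding_idx                # [[2, 3, 4, 5, 1, 1]]

print(position_ids)  # pads stay at padding_idx, real tokens count up from padding_idx + 1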
Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if position_ids is None: - position_ids = create_position_ids_from_input_ids( - input_ids, self.config.pad_token_id) + position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id) outputs = self.base_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, output_attentions=output_attentions, - output_hidden_states=True - if self.has_pre_transformation else output_hidden_states, - return_dict=return_dict, ) + output_hidden_states=True if self.has_pre_transformation else output_hidden_states, + return_dict=return_dict, + ) if self.has_pre_transformation: sequence_output2 = outputs["hidden_states"][-2] @@ -154,11 +147,13 @@ def forward( projection_state=projection_state2, last_hidden_state=outputs.last_hidden_state, hidden_states=outputs.hidden_states, - attentions=outputs.attentions, ) + attentions=outputs.attentions, + ) else: projection_state = self.transformation(outputs.last_hidden_state) return TransformationModelOutput( projection_state=projection_state, last_hidden_state=outputs.last_hidden_state, hidden_states=outputs.hidden_states, - attentions=outputs.attentions, ) + attentions=outputs.attentions, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index a610e38dbd5ac..0dee82d33981b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -24,8 +24,7 @@ from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . 
import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation @@ -85,37 +84,33 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: RobertaSeriesModelWithTransformation, - tokenizer: XLMRobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: RobertaSeriesModelWithTransformation, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -123,11 +118,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -148,12 +139,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -164,12 +153,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -181,18 +167,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
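The __init__ hunk keeps the rule that the VAE halves the spatial resolution at every block transition, so vae_scale_factor = 2 ** (len(block_out_channels) - 1). With a typical four-level VAE config (an assumption, not read from this diff), a 512x512 image maps to 64x64 latents:

block_out_channels = [128, 256, 512, 512]                # assumed 4-level VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)    # 2**3 == 8

height = width = 512
latent_h, latent_w = height // vae_scale_factor, width // vae_scale_factor
print(vae_scale_factor, latent_h, latent_w)              # 8 64 64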
@@ -232,29 +220,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because XLM-Roberta can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -262,8 +252,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -273,21 +262,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
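The truncation warning in _encode_prompt compares the length-capped encoding against a padding="longest" (uncapped) encoding and decodes whatever fell off the end. A toy sketch of that comparison with plain Python lists (no real tokenizer; the ids and the max length are made up):

model_max_length = 8
untruncated_ids = [101, 5, 6, 7, 8, 9, 10, 11, 12, 13, 102]        # full encoding, bos ... eos
text_input_ids = untruncated_ids[: model_max_length - 1] + [102]   # capped encoding, eos re-appended

if len(untruncated_ids) >= len(text_input_ids) and untruncated_ids != text_input_ids:
    # everything between the last kept position and the final special token was dropped
    removed = untruncated_ids[model_max_length - 1 : -1]
    print("truncated ids:", removed)   # [11, 12, 13]; the pipeline batch_decode()s these for the warning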
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -295,46 +285,42 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -353,53 +339,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -412,22 +394,25 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -446,25 +431,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -545,7 +530,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -567,7 +553,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -582,43 +569,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -631,8 +613,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -641,11 +622,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
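The denoising loop keeps the standard classifier-free-guidance trick: run the UNet once on a doubled batch of latents, split the prediction, and extrapolate from the unconditional half toward the text-conditioned half. A NumPy sketch of the blending step only (guidance_scale 7.5 as in the defaults above; fake_unet is a stand-in, so unlike a real conditioned UNet its two halves come out identical):

import numpy as np

guidance_scale = 7.5
latents = np.random.randn(1, 4, 64, 64).astype("float32")

def fake_unet(x):                         # stand-in for self.unet(...).sample
    return 0.1 * x

latent_model_input = np.concatenate([latents] * 2)      # [uncond, text] stacked into one batch
noise_pred = fake_unet(latent_model_input)

noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)                   # (1, 4, 64, 64), then handed to scheduler.step(...)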
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return AltDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 313c4e5e2eca1..232d79d8da99a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -27,8 +27,13 @@ from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + PIL_INTERPOLATION, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation @@ -74,11 +79,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -90,8 +91,7 @@ def preprocess(image): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionImg2ImgPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-guided image to image generation using Alt Diffusion. @@ -128,37 +128,33 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: RobertaSeriesModelWithTransformation, - tokenizer: XLMRobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: RobertaSeriesModelWithTransformation, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. 
`steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -166,11 +162,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -191,12 +183,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -207,12 +197,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -224,21 +211,23 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.register_to_config( - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -278,29 +267,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because XLM-Roberta can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -308,8 +299,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, 
-1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -319,21 +309,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -341,36 +332,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -379,17 +367,14 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=paddle.cast(safety_checker_input.pixel_values, - dtype), ) + clip_input=paddle.cast(safety_checker_input.pixel_values, dtype), + ) return image, has_nsfw_concept def decode_latents(self, latents): @@ -404,51 +389,48 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -461,25 +443,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -496,8 +472,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -505,8 +480,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -518,12 +492,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
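get_timesteps above turns the img2img strength into how much of the noise schedule is actually run: with 50 inference steps and strength 0.8, only the last 40 timesteps are denoised. A quick arithmetic check (first-order scheduler assumed, i.e. scheduler.order == 1):

num_inference_steps, strength, order = 50, 0.8, 1

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)   # 40
t_start = max(num_inference_steps - init_timestep, 0)                           # 10

timesteps = list(range(num_inference_steps))       # stand-in for scheduler.timesteps
kept = timesteps[t_start * order:]
print(init_timestep, t_start, len(kept))           # 40 10 40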
) @@ -542,24 +515,24 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -636,7 +609,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -657,17 +631,16 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image image = self.image_processor.preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - (batch_size * num_images_per_prompt, )) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile((batch_size * num_images_per_prompt,)) # 6. Prepare latent variables latents = self.prepare_latents( @@ -676,51 +649,45 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) else: image = latents has_nsfw_concept = None @@ -730,11 +697,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return AltDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py index ffe2c5bad7456..ca098c706711c 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py +++ b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py @@ -25,7 +25,9 @@ _import_error = "" except Exception as e: _librosa_can_be_imported = False - _import_error = f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it." + _import_error = ( + f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it." 
+ ) from PIL import Image # noqa: E402 @@ -46,14 +48,15 @@ class Mel(ConfigMixin, SchedulerMixin): @register_to_config def __init__( - self, - x_res: int=256, - y_res: int=256, - sample_rate: int=22050, - n_fft: int=2048, - hop_length: int=512, - top_db: int=80, - n_iter: int=32, ): + self, + x_res: int = 256, + y_res: int = 256, + sample_rate: int = 22050, + n_fft: int = 2048, + hop_length: int = 512, + top_db: int = 80, + n_iter: int = 32, + ): self.hop_length = hop_length self.sr = sample_rate self.n_fft = n_fft @@ -77,7 +80,7 @@ def set_resolution(self, x_res: int, y_res: int): self.n_mels = self.y_res self.slice_size = self.x_res * self.hop_length - 1 - def load_audio(self, audio_file: str=None, raw_audio: np.ndarray=None): + def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None): """Load audio. Args: @@ -91,10 +94,12 @@ def load_audio(self, audio_file: str=None, raw_audio: np.ndarray=None): # Pad with silence if necessary. if len(self.audio) < self.x_res * self.hop_length: - self.audio = np.concatenate([ - self.audio, - np.zeros((self.x_res * self.hop_length - len(self.audio), )), - ]) + self.audio = np.concatenate( + [ + self.audio, + np.zeros((self.x_res * self.hop_length - len(self.audio),)), + ] + ) def get_number_of_slices(self) -> int: """Get number of slices in audio. @@ -104,7 +109,7 @@ def get_number_of_slices(self) -> int: """ return len(self.audio) // self.slice_size - def get_audio_slice(self, slice: int=0) -> np.ndarray: + def get_audio_slice(self, slice: int = 0) -> np.ndarray: """Get slice of audio. Args: @@ -113,7 +118,7 @@ def get_audio_slice(self, slice: int=0) -> np.ndarray: Returns: `np.ndarray`: audio as numpy array """ - return self.audio[self.slice_size * slice:self.slice_size * (slice + 1)] + return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)] def get_sample_rate(self) -> int: """Get sample rate: @@ -137,11 +142,10 @@ def audio_slice_to_image(self, slice: int) -> Image.Image: sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, - n_mels=self.n_mels, ) + n_mels=self.n_mels, + ) log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db) - bytedata = (( - (log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5 - ).astype(np.uint8) + bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8) image = Image.fromarray(bytedata) return image @@ -154,8 +158,7 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray: Returns: audio (`np.ndarray`): raw audio """ - bytedata = np.frombuffer( - image.tobytes(), dtype="uint8").reshape((image.height, image.width)) + bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width)) log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db S = librosa.db_to_power(log_S) audio = librosa.feature.inverse.mel_to_audio( @@ -163,5 +166,6 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray: sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, - n_iter=self.n_iter, ) + n_iter=self.n_iter, + ) return audio diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py index 50b57cd936dac..581729f066b72 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py @@ -23,8 +23,12 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import 
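With the Mel defaults shown above (x_res 256, hop_length 512, sample_rate 22050), each spectrogram image covers x_res * hop_length - 1 audio samples, just under six seconds, and get_number_of_slices is an integer division by that slice size. The arithmetic, with a hypothetical 10-second clip:

x_res, hop_length, sample_rate = 256, 512, 22050

slice_size = x_res * hop_length - 1            # 131071 samples per image
seconds_per_slice = slice_size / sample_rate   # about 5.94 s

audio_len = 10 * sample_rate                   # hypothetical 10 s clip
num_slices = audio_len // slice_size           # 1 full slice fits
print(slice_size, round(seconds_per_slice, 2), num_slices)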
DDIMScheduler, DDPMScheduler from ...utils import randn_tensor -from ..pipeline_utils import (AudioPipelineOutput, BaseOutput, - DiffusionPipeline, ImagePipelineOutput) +from ..pipeline_utils import ( + AudioPipelineOutput, + BaseOutput, + DiffusionPipeline, + ImagePipelineOutput, +) from .mel import Mel @@ -43,14 +47,14 @@ class AudioDiffusionPipeline(DiffusionPipeline): _optional_components = ["vqvae"] def __init__( - self, - vqvae: AutoencoderKL, - unet: UNet2DConditionModel, - mel: Mel, - scheduler: Union[DDIMScheduler, DDPMScheduler], ): + self, + vqvae: AutoencoderKL, + unet: UNet2DConditionModel, + mel: Mel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + ): super().__init__() - self.register_modules( - unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) + self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) def get_input_dims(self) -> Tuple: """Returns dimension of input image @@ -62,8 +66,9 @@ def get_input_dims(self) -> Tuple: # For backwards compatibility sample_size = ( (input_module.config.sample_size, input_module.config.sample_size) - if type(input_module.config.sample_size) == int else - input_module.config.sample_size) + if type(input_module.config.sample_size) == int + else input_module.config.sample_size + ) return sample_size def get_default_steps(self) -> int: @@ -76,23 +81,25 @@ def get_default_steps(self) -> int: @paddle.no_grad() def __call__( - self, - batch_size: int=1, - audio_file: str=None, - raw_audio: np.ndarray=None, - slice: int=0, - start_step: int=0, - steps: int=None, - generator: paddle.Generator=None, - mask_start_secs: float=0, - mask_end_secs: float=0, - step_generator: paddle.Generator=None, - eta: float=0, - noise: paddle.Tensor=None, - encoding: paddle.Tensor=None, - return_dict=True, ) -> Union[Union[ - AudioPipelineOutput, ImagePipelineOutput], Tuple[List[ - Image.Image], Tuple[int, List[np.ndarray]]], ]: + self, + batch_size: int = 1, + audio_file: str = None, + raw_audio: np.ndarray = None, + slice: int = 0, + start_step: int = 0, + steps: int = None, + generator: paddle.Generator = None, + mask_start_secs: float = 0, + mask_end_secs: float = 0, + step_generator: paddle.Generator = None, + eta: float = 0, + noise: paddle.Tensor = None, + encoding: paddle.Tensor = None, + return_dict=True, + ) -> Union[ + Union[AudioPipelineOutput, ImagePipelineOutput], + Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]], + ]: """Generate random mel spectrogram from audio input and convert to audio. 
Args: @@ -122,7 +129,8 @@ def __call__( if type(self.unet.config.sample_size) == int: self.unet.config.sample_size = ( self.unet.config.sample_size, - self.unet.config.sample_size, ) + self.unet.config.sample_size, + ) input_dims = self.get_input_dims() self.mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0]) if noise is None: @@ -131,44 +139,43 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size[0], - self.unet.config.sample_size[1], ), - generator=generator, ) + self.unet.config.sample_size[1], + ), + generator=generator, + ) images = noise mask = None if audio_file is not None or raw_audio is not None: self.mel.load_audio(audio_file, raw_audio) input_image = self.mel.audio_slice_to_image(slice) - input_image = np.frombuffer( - input_image.tobytes(), dtype="uint8").reshape( - (input_image.height, input_image.width)) + input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape( + (input_image.height, input_image.width) + ) input_image = (input_image / 255) * 2 - 1 - input_images = paddle.to_tensor( - input_image[np.newaxis, :, :], dtype=paddle.float32) + input_images = paddle.to_tensor(input_image[np.newaxis, :, :], dtype=paddle.float32) if self.vqvae is not None: - input_images = self.vqvae.encode( - paddle.unsqueeze(input_images, 0)).latent_dist.sample( - generator=generator)[0] + input_images = self.vqvae.encode(paddle.unsqueeze(input_images, 0)).latent_dist.sample( + generator=generator + )[0] input_images = self.vqvae.config.scaling_factor * input_images if start_step > 0: - images[0, 0] = self.scheduler.add_noise( - input_images, noise, - self.scheduler.timesteps[start_step - 1]) + images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1]) - pixels_per_second = (self.unet.config.sample_size[1] * - self.mel.get_sample_rate() / self.mel.x_res / - self.mel.hop_length) + pixels_per_second = ( + self.unet.config.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length + ) mask_start = int(mask_start_secs * pixels_per_second) mask_end = int(mask_end_secs * pixels_per_second) mask = self.scheduler.add_noise( input_images, noise, - paddle.to_tensor(self.scheduler.timesteps[start_step:]), ) + paddle.to_tensor(self.scheduler.timesteps[start_step:]), + ) - for step, t in enumerate( - self.progress_bar(self.scheduler.timesteps[start_step:])): + for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])): if isinstance(self.unet, UNet2DConditionModel): model_output = self.unet(images, t, encoding)["sample"] else: @@ -180,13 +187,15 @@ def __call__( timestep=t, sample=images, eta=eta, - generator=step_generator, )["prev_sample"] + generator=step_generator, + )["prev_sample"] else: images = self.scheduler.step( model_output=model_output, timestep=t, sample=images, - generator=step_generator, )["prev_sample"] + generator=step_generator, + )["prev_sample"] if mask is not None: if mask_start > 0: @@ -202,20 +211,20 @@ def __call__( images = (images / 2 + 0.5).clip(0, 1) images = images.transpose([0, 2, 3, 1]).cast("float32").numpy() images = (images * 255).round().astype("uint8") - images = list((Image.fromarray(_[:, :, 0]) for _ in images) - if images.shape[3] == 1 else (Image.fromarray( - _, mode="RGB").convert("L") for _ in images)) + images = list( + (Image.fromarray(_[:, :, 0]) for _ in images) + if images.shape[3] == 1 + else (Image.fromarray(_, mode="RGB").convert("L") for _ in images) + ) audios = [self.mel.image_to_audio(_) for _ in images] 
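The Mel helper reformatted above stores each log-mel spectrogram slice as an 8-bit grayscale image: audio_slice_to_image maps dB values in [-top_db, 0] onto [0, 255], and image_to_audio inverts that mapping before handing the spectrogram back to librosa.feature.inverse.mel_to_audio. A minimal standalone sketch of just that byte mapping, using NumPy only and illustrative values (a sketch, not part of the diff):

import numpy as np

top_db = 80  # the Mel default above (top_db=80)

def db_to_bytes(log_S: np.ndarray) -> np.ndarray:
    # forward mapping used by audio_slice_to_image: [-top_db, 0] dB -> uint8 [0, 255]
    return (((log_S + top_db) * 255 / top_db).clip(0, 255) + 0.5).astype(np.uint8)

def bytes_to_db(bytedata: np.ndarray) -> np.ndarray:
    # inverse mapping used by image_to_audio; the round trip loses at most ~0.16 dB per bin
    return bytedata.astype("float") * top_db / 255 - top_db

log_S = np.array([[-80.0, -40.0, -3.0, 0.0]])  # illustrative dB values
assert np.allclose(bytes_to_db(db_to_bytes(log_S)), log_S, atol=top_db / 255)

Storing spectrograms as uint8 images is what lets the pipeline reuse ordinary image-space diffusion models on audio.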
if not return_dict: return images, (self.mel.get_sample_rate(), audios) - return BaseOutput( - **AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), - **ImagePipelineOutput(images)) + return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images)) @paddle.no_grad() - def encode(self, images: List[Image.Image], steps: int=50) -> np.ndarray: + def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray: """Reverse step process: recover noisy image from generated image. Args: @@ -229,36 +238,30 @@ def encode(self, images: List[Image.Image], steps: int=50) -> np.ndarray: # Only works with DDIM as this method is deterministic assert isinstance(self.scheduler, DDIMScheduler) self.scheduler.set_timesteps(steps) - sample = np.array([ - np.frombuffer( - image.tobytes(), dtype="uint8").reshape( - (1, image.height, image.width)) for image in images - ]) + sample = np.array( + [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images] + ) sample = (sample / 255) * 2 - 1 sample = paddle.to_tensor(sample) - for t in self.progress_bar( - paddle.flip(self.scheduler.timesteps, (0, ))): - prev_timestep = (t - self.scheduler.num_train_timesteps // - self.scheduler.num_inference_steps) + for t in self.progress_bar(paddle.flip(self.scheduler.timesteps, (0,))): + prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps alpha_prod_t = self.scheduler.alphas_cumprod[t] - alpha_prod_t_prev = (self.scheduler.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 else - self.scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + self.scheduler.alphas_cumprod[prev_timestep] + if prev_timestep >= 0 + else self.scheduler.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t model_output = self.unet(sample, t)["sample"] - pred_sample_direction = (1 - alpha_prod_t_prev)**( - 0.5) * model_output - sample = (sample - pred_sample_direction) * alpha_prod_t_prev**( - -0.5) - sample = (sample * alpha_prod_t**(0.5) + beta_prod_t** - (0.5) * model_output) + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output + sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5) + sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output return sample @staticmethod - def slerp(x0: paddle.Tensor, x1: paddle.Tensor, - alpha: float) -> paddle.Tensor: + def slerp(x0: paddle.Tensor, x1: paddle.Tensor, alpha: float) -> paddle.Tensor: """Spherical Linear intERPolation Args: @@ -270,8 +273,5 @@ def slerp(x0: paddle.Tensor, x1: paddle.Tensor, `paddle.Tensor`: interpolated tensor """ - theta = acos( - paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) / - paddle.norm(x0) / paddle.norm(x1)) - return sin((1 - alpha) * theta) * x0 / sin(theta) + sin( - alpha * theta) * x1 / sin(theta) + theta = acos(paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) / paddle.norm(x0) / paddle.norm(x1)) + return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta) diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py index 87a892da4d792..4ab25efc20003 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
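For reference, the slerp helper reformatted at the end of pipeline_audio_diffusion.py above interpolates along the arc between two flattened tensors: with theta = arccos(<x0, x1> / (||x0|| * ||x1||)), the result is sin((1 - alpha) * theta) / sin(theta) * x0 + sin(alpha * theta) / sin(theta) * x1. A small NumPy check of that formula on illustrative vectors (a sketch, not part of the diff):

import numpy as np

def slerp(x0: np.ndarray, x1: np.ndarray, alpha: float) -> np.ndarray:
    # same formula as AudioDiffusionPipeline.slerp; assumes x0 and x1 are not (anti)parallel
    theta = np.arccos(np.dot(x0.ravel(), x1.ravel()) / (np.linalg.norm(x0) * np.linalg.norm(x1)))
    return np.sin((1 - alpha) * theta) * x0 / np.sin(theta) + np.sin(alpha * theta) * x1 / np.sin(theta)

x0, x1 = np.array([1.0, 0.0]), np.array([0.0, 1.0])  # orthogonal unit vectors, theta = pi / 2
assert np.allclose(slerp(x0, x1, 0.0), x0) and np.allclose(slerp(x0, x1, 1.0), x1)
assert np.allclose(np.linalg.norm(slerp(x0, x1, 0.5)), 1.0)  # midpoint stays on the unit sphere

Unlike a straight linear blend, this keeps interpolated noise at the norm the scheduler expects, which is the usual reason to prefer it when interpolating between noise tensors.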
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_paddlenlp_available, is_paddlenlp_version) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, + is_paddlenlp_version, +) try: - if not (is_paddlenlp_available() and is_paddle_available() and - is_paddlenlp_version(">=", "2.5.2")): + if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.5.2")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import AudioLDMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py index 0ba945ffdf429..8354d5e18ad8b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py @@ -18,8 +18,11 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (ClapTextModelWithProjection, - RobertaTokenizer, SpeechT5HifiGan) +from paddlenlp.transformers import ( + ClapTextModelWithProjection, + RobertaTokenizer, + SpeechT5HifiGan, +) from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -65,13 +68,14 @@ class AudioLDMPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: ClapTextModelWithProjection, - tokenizer: RobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - vocoder: SpeechT5HifiGan, ): + self, + vae: AutoencoderKL, + text_encoder: ClapTextModelWithProjection, + tokenizer: RobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + vocoder: SpeechT5HifiGan, + ): super().__init__() self.register_modules( vae=vae, @@ -79,17 +83,19 @@ def __init__( tokenizer=tokenizer, unet=unet, scheduler=scheduler, - vocoder=vocoder, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + vocoder=vocoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) def _encode_prompt( - self, - prompt, - num_waveforms_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): """ Encodes the prompt into text encoder hidden states. @@ -113,13 +119,13 @@ def _encode_prompt( argument. 
""" if self.text_encoder.text_model.embeddings.token_type_ids.dtype not in [ - paddle.int16, - paddle.int32, - paddle.int64, + paddle.int16, + paddle.int32, + paddle.int64, ]: self.text_encoder.text_model.embeddings.token_type_ids = ( - self.text_encoder.text_model.embeddings.token_type_ids.cast( - "int32")) + self.text_encoder.text_model.embeddings.token_type_ids.cast("int32") + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -134,34 +140,35 @@ def _encode_prompt( max_length=self.tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids attention_mask = text_inputs.attention_mask untruncated_ids = self.tokenizer( prompt, padding="longest", return_tensors="pd", - return_attention_mask=True, ).input_ids - if (untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and - not paddle.equal_all( - x=text_input_ids, y=untruncated_ids).item()): + return_attention_mask=True, + ).input_ids + if ( + untruncated_ids.shape[-1] >= text_input_ids.shape[-1] + and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item() + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( f"The following part of your input was truncated because CLAP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" ) - prompt_embeds = self.text_encoder( - text_input_ids.cast("int32"), attention_mask=attention_mask) + prompt_embeds = self.text_encoder(text_input_ids.cast("int32"), attention_mask=attention_mask) prompt_embeds = prompt_embeds.text_embeds # additional L_2 normalization over each hidden-state prompt_embeds = F.normalize(x=prompt_embeds, axis=-1) prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) bs_embed, seq_len = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile( - repeat_times=[1, num_waveforms_per_prompt]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_waveforms_per_prompt, seq_len]) + prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_waveforms_per_prompt, seq_len]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -187,33 +194,28 @@ def _encode_prompt( max_length=max_length, truncation=True, return_tensors="pd", - return_attention_mask=True, ) + return_attention_mask=True, + ) uncond_input_ids = uncond_input.input_ids attention_mask = uncond_input.attention_mask - negative_prompt_embeds = self.text_encoder( - uncond_input_ids.cast("int32"), attention_mask=attention_mask) + negative_prompt_embeds = self.text_encoder(uncond_input_ids.cast("int32"), attention_mask=attention_mask) negative_prompt_embeds = negative_prompt_embeds.text_embeds # additional L_2 normalization over each hidden-state - negative_prompt_embeds = F.normalize( - x=negative_prompt_embeds, axis=-1) + negative_prompt_embeds = F.normalize(x=negative_prompt_embeds, axis=-1) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = 
negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - repeat_times=[1, num_waveforms_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_waveforms_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_waveforms_per_prompt, seq_len]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - x=[negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds]) return prompt_embeds def decode_latents(self, latents): @@ -235,28 +237,27 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - audio_length_in_s, - vocoder_upsample_factor, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor if audio_length_in_s < min_audio_length_in_s: raise ValueError( @@ -266,8 +267,11 @@ def check_inputs( raise ValueError( f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of {self.vae_scale_factor}." ) - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) @@ -279,11 +283,8 @@ def check_inputs( raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." 
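The _encode_prompt method above concatenates negative_prompt_embeds and prompt_embeds into one batch precisely so the denoising loops elsewhere in this patch can run a single UNet forward pass, split the prediction, and blend the two halves with guidance_scale. A minimal sketch of that guidance arithmetic, with random tensors standing in for the UNet output (illustrative shapes, not part of the diff):

import paddle

guidance_scale = 2.5                       # AudioLDM's default guidance_scale in this patch
noise_pred = paddle.randn([2, 8, 64, 16])  # batched prediction: [uncond, text], hypothetical latent shape

noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
assert guided.shape == [1, 8, 64, 16]      # same shape as a single latent sample

With guidance_scale = 1 this reduces to the text-conditioned prediction; larger values push the sample further away from the unconditional direction.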
@@ -294,18 +295,13 @@ def check_inputs( f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." ) - def prepare_latents(self, - batch_size, - num_channels_latents, - height, - dtype, - generator, - latents=None): + def prepare_latents(self, batch_size, num_channels_latents, height, dtype, generator, latents=None): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - self.vocoder.config.model_in_dim // self.vae_scale_factor, ) + self.vocoder.config.model_in_dim // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." @@ -322,24 +318,24 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - audio_length_in_s: Optional[float]=None, - num_inference_steps: int=10, - guidance_scale: float=2.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_waveforms_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - output_type: Optional[str]="np", ): + self, + prompt: Union[str, List[str]] = None, + audio_length_in_s: Optional[float] = None, + num_inference_steps: int = 10, + guidance_scale: float = 2.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_waveforms_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + output_type: Optional[str] = "np", + ): """ Function invoked when calling the pipeline for generation. @@ -406,18 +402,13 @@ def __call__( When returning a tuple, the first element is a list with the generated audios. """ # 0. 
Convert audio input length from seconds to spectrogram height - vocoder_upsample_factor = (np.prod(self.vocoder.config.upsample_rates) / - self.vocoder.config.sampling_rate) + vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate if audio_length_in_s is None: - audio_length_in_s = (self.unet.config.sample_size * - self.vae_scale_factor * - vocoder_upsample_factor) + audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor height = int(audio_length_in_s / vocoder_upsample_factor) - original_waveform_length = int(audio_length_in_s * - self.vocoder.config.sampling_rate) + original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) if height % self.vae_scale_factor != 0: - height = (int(np.ceil(height / self.vae_scale_factor)) * - self.vae_scale_factor) + height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor logger.info( f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} so that it can be handled by the model. It will be cut to {audio_length_in_s} after the denoising process." ) @@ -430,7 +421,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -452,7 +444,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -466,21 +459,19 @@ def __call__( height, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( @@ -488,22 +479,19 @@ def __call__( t, encoder_hidden_states=None, class_labels=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -519,6 +507,6 @@ def __call__( audio = audio.numpy() if not return_dict: - return (audio, ) + return (audio,) return AudioPipelineOutput(audios=audio) diff --git a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py index cc5f2a1b40f43..b4bc68019bf35 100644 --- a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py @@ -40,13 +40,13 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=100, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - audio_length_in_s: Optional[float]=None, - return_dict: bool=True, ) -> Union[AudioPipelineOutput, Tuple]: + self, + batch_size: int = 1, + num_inference_steps: int = 100, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + audio_length_in_s: Optional[float] = None, + return_dict: bool = True, + ) -> Union[AudioPipelineOutput, Tuple]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -67,18 +67,18 @@ def __call__( True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. 
""" if audio_length_in_s is None: - audio_length_in_s = (self.unet.config.sample_size / - self.unet.config.sample_rate) + audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate sample_size = audio_length_in_s * self.unet.config.sample_rate - down_scale_factor = 2**len(self.unet.up_blocks) + down_scale_factor = 2 ** len(self.unet.up_blocks) if sample_size < 3 * down_scale_factor: raise ValueError( f"{audio_length_in_s} is too small. Make sure it's bigger or equal to {3 * down_scale_factor / self.unet.config.sample_rate}." ) original_sample_size = int(sample_size) if sample_size % down_scale_factor != 0: - sample_size = (audio_length_in_s * self.unet.config.sample_rate // - down_scale_factor + 1) * down_scale_factor + sample_size = ( + audio_length_in_s * self.unet.config.sample_rate // down_scale_factor + 1 + ) * down_scale_factor logger.info( f"{audio_length_in_s} is increased to {sample_size / self.unet.config.sample_rate} so that it can be handled by the model. It will be cut to {original_sample_size / self.unet.config.sample_rate} after the denoising process." ) @@ -105,5 +105,5 @@ def __call__( audio = audio.clip(min=-1, max=1).astype(dtype="float32").cpu().numpy() audio = audio[:, :, :original_sample_size] if not return_dict: - return (audio, ) + return (audio,) return AudioPipelineOutput(audios=audio) diff --git a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py index ee8dbc0143053..2ffd3401ceb13 100644 --- a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py +++ b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py @@ -42,15 +42,15 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - eta: float=0.0, - num_inference_steps: int=50, - use_clipped_model_output: Optional[bool]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + eta: float = 0.0, + num_inference_steps: int = 50, + use_clipped_model_output: Optional[bool] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -82,19 +82,20 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ) + self.unet.config.sample_size, + ) else: image_shape = ( batch_size, self.unet.config.in_channels, - *self.unet.config.sample_size, ) + *self.unet.config.sample_size, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
) - image = randn_tensor( - image_shape, generator=generator, dtype=self.unet.dtype) + image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -112,7 +113,8 @@ def __call__( image, eta=eta, use_clipped_model_output=use_clipped_model_output, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() @@ -120,5 +122,5 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py index cc73ea0e507a5..4ff2fe9a23bd9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py @@ -38,13 +38,13 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - num_inference_steps: int=1000, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + num_inference_steps: int = 1000, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -70,12 +70,14 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ) + self.unet.config.sample_size, + ) else: image_shape = ( batch_size, self.unet.config.in_channels, - *self.unet.config.sample_size, ) + *self.unet.config.sample_size, + ) image = randn_tensor(image_shape, generator=generator) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -84,12 +86,11 @@ def __call__( model_output = self.unet(image, t).sample # 2. 
compute previous image: x_t -> x_t-1 - image = self.scheduler.step( - model_output, t, image, generator=generator).prev_sample + image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py index ca49b436b3f91..fccb87f08b7b7 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py @@ -18,12 +18,22 @@ import numpy as np import PIL -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_paddle_available, is_paddlenlp_available) -from .timesteps import (fast27_timesteps, smart27_timesteps, smart50_timesteps, - smart100_timesteps, smart185_timesteps, - super27_timesteps, super40_timesteps, - super100_timesteps) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) +from .timesteps import ( + fast27_timesteps, + smart27_timesteps, + smart50_timesteps, + smart100_timesteps, + smart185_timesteps, + super27_timesteps, + super40_timesteps, + super100_timesteps, +) @dataclass @@ -55,11 +65,11 @@ class IFPipelineOutput(BaseOutput): else: from .pipeline_if import IFPipeline from .pipeline_if_img2img import IFImg2ImgPipeline - from .pipeline_if_img2img_superresolution import \ - IFImg2ImgSuperResolutionPipeline + from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline from .pipeline_if_inpainting import IFInpaintingPipeline - from .pipeline_if_inpainting_superresolution import \ - IFInpaintingSuperResolutionPipeline + from .pipeline_if_inpainting_superresolution import ( + IFInpaintingSuperResolutionPipeline, + ) from .pipeline_if_superresolution import IFSuperResolutionPipeline from .safety_checker import IFSafetyChecker from .watermark import IFWatermarker diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py index 787a25590a6e1..2a7c3bddcaedd 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -19,14 +19,19 @@ from typing import Any, Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...loaders import LoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, is_bs4_available, is_ftfy_available, - logging, randn_tensor, replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -101,8 +106,8 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -113,15 +118,16 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -147,19 +153,21 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -186,7 +194,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -199,31 +208,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -238,8 +247,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -252,12 +260,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -266,12 +274,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -281,10 +291,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -296,11 +304,11 @@ def encode_prompt( def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -314,46 +322,44 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -366,10 +372,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - def prepare_intermediate_images(self, batch_size, num_channels, height, - width, dtype, generator): + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator): shape = (batch_size, num_channels, height, width) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -377,8 +383,7 @@ def prepare_intermediate_images(self, batch_size, num_channels, height, f" size of {batch_size}. Make sure the batch size matches the length of the generators." ) - intermediate_images = randn_tensor( - shape, generator=generator, dtype=dtype) + intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype) # scale the initial noise by the standard deviation required by the scheduler intermediate_images = intermediate_images * self.scheduler.init_noise_sigma @@ -386,14 +391,12 @@ def prepare_intermediate_images(self, batch_size, num_channels, height, def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -419,11 +422,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -450,7 +455,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -477,15 +483,13 @@ def _clean_caption(self, caption): # "123456.." 
caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -503,13 +507,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -529,26 +530,26 @@ def _clean_caption(self, caption): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - num_inference_steps: int=100, - timesteps: List[int]=None, - guidance_scale: float=7.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - height: Optional[int]=None, - width: Optional[int]=None, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - clean_caption: bool=True, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -625,7 +626,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters height = height or self.unet.config.sample_size @@ -651,11 +653,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps if timesteps is not None: @@ -673,19 +675,19 @@ def __call__( height, width, prompt_embeds.dtype, - generator, ) + generator, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - model_input = (paddle.concat([intermediate_images] * 2) - if do_classifier_free_guidance else - intermediate_images) + model_input = ( + paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -694,7 +696,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: @@ -704,27 +707,28 @@ def __call__( model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) if self.scheduler.config.variance_type not in [ - "learned", - "learned_range", + "learned", + "learned_range", ]: noise_pred, _ = noise_pred.split( [ model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( @@ -732,12 +736,11 @@ def __call__( t, intermediate_images, **extra_step_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -750,16 +753,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) # 11. 
Apply watermark if self.watermarker is not None: - image = self.watermarker.apply_watermark( - image, self.unet.config.sample_size) + image = self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -770,8 +771,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -779,4 +779,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index 7fa08748a3d86..30df336ebed8c 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -21,14 +21,19 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -55,8 +60,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -127,8 +131,8 @@ class IFImg2ImgPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -139,15 +143,16 @@ class IFImg2ImgPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -173,20 +178,22 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -213,7 +220,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -226,31 +234,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -265,8 +273,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -279,12 +286,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -293,12 +300,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -308,10 +317,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -324,11 +331,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -342,48 +349,46 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -396,19 +401,23 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) if isinstance(image, list): check_image_type = image[0] else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -422,21 +431,17 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -463,11 +468,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -494,7 +501,8 @@ def _clean_caption(self, caption): caption = re.sub( 
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -521,15 +529,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -547,13 +553,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -598,35 +601,24 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] return timesteps, num_inference_steps - t_start - def prepare_intermediate_images(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): _, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -649,27 +641,33 @@ def prepare_intermediate_images(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - strength: float=0.7, - num_inference_steps: int=80, - timesteps: List[int]=None, - guidance_scale: float=10.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: 
Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - clean_caption: bool=True, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 0.7, + num_inference_steps: int = 80, + timesteps: List[int] = None, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -760,7 +758,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -775,11 +774,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -792,32 +791,29 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare intermediate images image = self.preprocess_image(image) image = image.cast(dtype) noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( - image, noise_timestep, batch_size, num_images_per_prompt, dtype, - generator) + image, noise_timestep, batch_size, num_images_per_prompt, dtype, generator + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - model_input = (paddle.concat([intermediate_images] * 2) - if do_classifier_free_guidance else - intermediate_images) + model_input = ( + paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -825,7 +821,8 @@ def __call__( model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -835,27 +832,25 @@ def __call__( model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -868,16 +863,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) # 11. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -888,8 +881,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -897,4 +889,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 42dd7fa35fa27..63e586bf00e34 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -22,14 +22,19 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -57,8 +62,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -130,8 +134,8 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -141,16 +145,17 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -182,20 +187,19 @@ def __init__( image_noising_scheduler=image_noising_scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) 
self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -222,11 +226,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -253,7 +259,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -280,15 +287,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -306,13 +311,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -332,14 +334,15 @@ def _clean_caption(self, caption): @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -366,7 +369,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -379,31 +383,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -419,8 +423,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -433,12 +436,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -447,12 +450,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -461,10 +466,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -477,11 +480,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -495,49 +498,47 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - original_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -550,7 +551,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # image @@ -559,12 +561,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -578,9 +583,7 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # original_image @@ -589,12 +592,15 @@ def check_inputs( else: check_image_type = original_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(original_image, list): image_batch_size = len(original_image) @@ -613,8 +619,7 @@ def check_inputs( ) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image - def preprocess_original_image(self, - image: PIL.Image.Image) -> paddle.Tensor: + def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor: if not isinstance(image, list): image = [image] @@ -642,21 +647,16 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image, - num_images_per_prompt) -> paddle.Tensor: + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor: if not isinstance(image, paddle.Tensor) and not isinstance(image, list): image = [image] @@ -679,8 +679,7 @@ def preprocess_image(self, image: PIL.Image.Image, elif dims == 4: image = paddle.concat(image, axis=0) else: - raise ValueError( - f"Image must have 3 or 4 dimensions, instead got {dims}") + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") image = image.cast(self.unet.dtype) @@ -691,8 +690,7 @@ def preprocess_image(self, image: PIL.Image.Image, # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -700,13 +698,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images - def prepare_intermediate_images(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): _, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -729,30 +721,35 @@ def prepare_intermediate_images(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], - original_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, - List[PIL.Image.Image], List[ - paddle.Tensor], List[np.ndarray], ]=None, - strength: float=0.8, - prompt: Union[str, List[str]]=None, - num_inference_steps: int=50, - timesteps: List[int]=None, - guidance_scale: float=4.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - 
num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=250, - clean_caption: bool=True, ): + self, + image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], + original_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): """ Function invoked when calling the pipeline for generation. @@ -848,7 +845,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters @@ -865,11 +863,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -882,8 +880,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. prepare original image original_image = self.preprocess_original_image(original_image) @@ -891,8 +888,7 @@ def __call__( # 6. Prepare intermediate images noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( original_image, @@ -900,21 +896,19 @@ def __call__( batch_size, num_images_per_prompt, dtype, - generator, ) + generator, + ) # 7. 
Prepare upscaled image and noise level _, _, height, width = original_image.shape image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor( - upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level) + noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) if do_classifier_free_guidance: noise_level = paddle.concat([noise_level] * 2) @@ -923,19 +917,15 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): model_input = paddle.concat( - [ - intermediate_images, - upscaled.cast(intermediate_images.dtype) - ], - axis=1, ) - - model_input = (paddle.concat([model_input] * 2) - if do_classifier_free_guidance else model_input) + [intermediate_images, upscaled.cast(intermediate_images.dtype)], + axis=1, + ) + + model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -944,7 +934,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -952,31 +943,27 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ model_input.shape[1] // 2, - noise_pred_uncond.shape[1] - model_input.shape[1] // - 2, + noise_pred_uncond.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1] // 2, - noise_pred_text.shape[1] - model_input.shape[1] // - 2, + noise_pred_text.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -989,16 +976,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 12. Convert to PIL image = self.numpy_to_pil(image) # 13. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -1008,8 +993,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -1017,4 +1001,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index 72fd143c156c2..5ff5992901c78 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -21,14 +21,19 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -56,8 +61,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -130,8 +134,8 @@ class IFInpaintingPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -142,15 +146,16 @@ class IFInpaintingPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -176,20 +181,22 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -216,7 +223,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -229,32 +237,32 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -269,8 +277,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -283,12 +290,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -297,12 +304,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -312,10 +321,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -328,11 +335,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -346,49 +353,47 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - mask_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -401,7 +406,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # image @@ -410,12 +416,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -429,9 +438,7 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # mask_image @@ -440,12 +447,15 @@ def check_inputs( else: check_image_type = mask_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(mask_image, list): image_batch_size = len(mask_image) @@ -466,14 +476,12 @@ def check_inputs( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -500,11 +508,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -531,7 +541,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -558,15 +569,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -584,13 +593,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -636,15 +642,11 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image @@ -653,10 +655,9 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = [mask_image] if isinstance(mask_image[0], paddle.Tensor): - mask_image = (paddle.concat( - mask_image, axis=0) - if mask_image[0].ndim == 4 else paddle.stack( - mask_image, axis=0)) + mask_image = ( + paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0) + ) if mask_image.ndim == 2: # Batch and add channel dim for single mask @@ -692,8 +693,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = paddle.to_tensor(mask_image) elif isinstance(mask_image[0], np.ndarray): - mask_image = np.concatenate( - [m[None, None, :] for m in mask_image], axis=0) + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) mask_image[mask_image < 0.5] = 0 mask_image[mask_image >= 0.5] = 1 @@ -704,8 +704,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -713,14 +712,15 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start def prepare_intermediate_images( - self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - mask_image, - generator=None, ): + self, + image, + timestep, + batch_size, + num_images_per_prompt, + dtype, + mask_image, + generator=None, + ): image_batch_size, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -745,29 +745,41 @@ def prepare_intermediate_images( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - mask_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - strength: float=1.0, - num_inference_steps: int=50, - timesteps: List[int]=None, - guidance_scale: float=7.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - 
callback_steps: int=1, - clean_caption: bool=True, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + mask_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -864,7 +876,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -879,11 +892,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -896,8 +909,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare intermediate images image = self.preprocess_image(image) @@ -907,15 +919,12 @@ def __call__( mask_image = mask_image.cast(dtype) if mask_image.shape[0] == 1: - mask_image = mask_image.repeat_interleave( - batch_size * num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0) else: - mask_image = mask_image.repeat_interleave( - num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0) noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( image, @@ -924,19 +933,19 @@ def __call__( num_images_per_prompt, dtype, mask_image, - generator, ) + generator, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - model_input = (paddle.concat([intermediate_images] * 2) - if do_classifier_free_guidance else - intermediate_images) + model_input = ( + paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -944,7 +953,8 @@ def __call__( model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -954,33 +964,29 @@ def __call__( model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 prev_intermediate_images = intermediate_images intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample - intermediate_images = ( - 1 - mask_image - ) * prev_intermediate_images + mask_image * intermediate_images + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -993,16 +999,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) # 11. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -1013,8 +1017,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -1022,4 +1025,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index a9d271872306a..7b1c73e660a40 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -22,14 +22,19 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -57,8 +62,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -132,8 +136,8 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -144,16 +148,17 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -185,20 +190,19 @@ def __init__( image_noising_scheduler=image_noising_scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) 
self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -225,11 +229,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -256,7 +262,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -283,15 +290,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -309,13 +314,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -335,14 +337,15 @@ def _clean_caption(self, caption): @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -369,7 +372,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -382,31 +386,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -421,8 +425,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -435,12 +438,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -449,12 +452,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -464,10 +469,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -480,11 +483,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -498,50 +501,48 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - original_image, - mask_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" 
{type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -554,7 +555,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # image @@ -563,12 +565,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -582,9 +587,7 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # original_image @@ -593,12 +596,15 @@ def check_inputs( else: check_image_type = original_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(original_image, list): image_batch_size = len(original_image) @@ -623,12 +629,15 @@ def check_inputs( else: check_image_type = mask_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(mask_image, list): image_batch_size = len(mask_image) @@ -647,8 +656,7 @@ def check_inputs( ) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image - def preprocess_original_image(self, - image: PIL.Image.Image) -> paddle.Tensor: + def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor: if not isinstance(image, list): image = [image] @@ -676,21 +684,16 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image, - num_images_per_prompt) -> paddle.Tensor: + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor: if not isinstance(image, paddle.Tensor) and not isinstance(image, list): image = [image] @@ -713,8 +716,7 @@ def preprocess_image(self, image: PIL.Image.Image, elif dims == 4: image = paddle.concat(image, axis=0) else: - raise ValueError( - f"Image must have 3 or 4 dimensions, instead got {dims}") + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") image = image.cast(self.unet.dtype) @@ -728,10 +730,9 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = [mask_image] if isinstance(mask_image[0], paddle.Tensor): - mask_image = (paddle.concat( - mask_image, axis=0) - if mask_image[0].ndim == 4 else paddle.stack( - mask_image, axis=0)) + mask_image = ( + paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0) + ) if mask_image.ndim == 2: # Batch and add channel dim for single mask @@ -767,8 +768,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = paddle.to_tensor(mask_image) elif isinstance(mask_image[0], np.ndarray): - mask_image = np.concatenate( - [m[None, None, :] for m in mask_image], axis=0) + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) mask_image[mask_image < 0.5] = 0 mask_image[mask_image >= 0.5] = 1 @@ -779,8 +779,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -789,14 +788,15 @@ def get_timesteps(self, num_inference_steps, strength): # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images def prepare_intermediate_images( - self, - image, - timestep, - batch_size, - 
num_images_per_prompt, - dtype, - mask_image, - generator=None, ): + self, + image, + timestep, + batch_size, + num_images_per_prompt, + dtype, + mask_image, + generator=None, + ): image_batch_size, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -821,32 +821,43 @@ def prepare_intermediate_images( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], - original_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, - List[PIL.Image.Image], List[ - paddle.Tensor], List[np.ndarray], ]=None, - mask_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - strength: float=0.8, - prompt: Union[str, List[str]]=None, - num_inference_steps: int=100, - timesteps: List[int]=None, - guidance_scale: float=4.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=0, - clean_caption: bool=True, ): + self, + image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], + original_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + mask_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + clean_caption: bool = True, + ): """ Function invoked when calling the pipeline for generation. @@ -948,7 +959,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters @@ -965,11 +977,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -982,8 +994,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. prepare original image original_image = self.preprocess_original_image(original_image) @@ -994,16 +1005,13 @@ def __call__( mask_image = mask_image.cast(dtype) if mask_image.shape[0] == 1: - mask_image = mask_image.repeat_interleave( - batch_size * num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0) else: - mask_image = mask_image.repeat_interleave( - num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0) # 6. Prepare intermediate images noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( original_image, @@ -1012,21 +1020,19 @@ def __call__( num_images_per_prompt, dtype, mask_image, - generator, ) + generator, + ) # 7. Prepare upscaled image and noise level _, _, height, width = original_image.shape image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor( - upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level) + noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) if do_classifier_free_guidance: noise_level = paddle.concat([noise_level] * 2) @@ -1035,19 +1041,15 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): model_input = paddle.concat( - [ - intermediate_images, - upscaled.cast(intermediate_images.dtype) - ], - axis=1, ) - - model_input = (paddle.concat([model_input] * 2) - if do_classifier_free_guidance else model_input) + [intermediate_images, upscaled.cast(intermediate_images.dtype)], + axis=1, + ) + + model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -1056,7 +1058,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -1064,37 +1067,31 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ model_input.shape[1] // 2, - noise_pred_uncond.shape[1] - model_input.shape[1] // - 2, + noise_pred_uncond.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1] // 2, - noise_pred_text.shape[1] - model_input.shape[1] // - 2, + noise_pred_text.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 prev_intermediate_images = intermediate_images intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample - intermediate_images = ( - 1 - mask_image - ) * prev_intermediate_images + mask_image * intermediate_images + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -1107,16 +1104,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 12. Convert to PIL image = self.numpy_to_pil(image) # 13. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -1127,8 +1122,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -1136,4 +1130,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index b2aa43abe1a5c..ce92083c54c1a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -22,13 +22,18 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, is_bs4_available, is_ftfy_available, - logging, randn_tensor, replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -86,8 +91,8 @@ class IFSuperResolutionPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -98,16 +103,17 @@ class IFSuperResolutionPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -139,20 +145,19 @@ def __init__( image_noising_scheduler=image_noising_scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") 
clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -179,11 +184,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -210,7 +217,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -237,15 +245,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -263,13 +269,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -289,14 +292,15 @@ def _clean_caption(self, caption): @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -323,7 +327,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -336,32 +341,32 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -376,8 +381,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -390,12 +394,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -404,12 +408,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -419,10 +425,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -435,11 +439,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -453,49 +457,47 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - batch_size, - noise_level, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -508,10 +510,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - if (noise_level < 0 or noise_level >= - self.image_noising_scheduler.config.num_train_timesteps): + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: raise ValueError( f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})" ) @@ -521,12 +523,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -540,13 +545,10 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images - def prepare_intermediate_images(self, batch_size, num_channels, height, - width, dtype, generator): + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator): shape = (batch_size, num_channels, height, width) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -554,8 +556,7 @@ def prepare_intermediate_images(self, batch_size, num_channels, height, f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
) - intermediate_images = randn_tensor( - shape, generator=generator, dtype=dtype) + intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype) # scale the initial noise by the standard deviation required by the scheduler intermediate_images = intermediate_images * self.scheduler.init_noise_sigma @@ -584,8 +585,7 @@ def preprocess_image(self, image, num_images_per_prompt): elif dims == 4: image = paddle.concat(image, axis=0) else: - raise ValueError( - f"Image must have 3 or 4 dimensions, instead got {dims}") + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") image = image.cast(self.unet.dtype) @@ -596,28 +596,28 @@ def preprocess_image(self, image, num_images_per_prompt): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: int=None, - width: int=None, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor]=None, - num_inference_steps: int=50, - timesteps: List[int]=None, - guidance_scale: float=4.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=250, - clean_caption: bool=True, ): + self, + prompt: Union[str, List[str]] = None, + height: int = None, + width: int = None, + image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): """ Function invoked when calling the pipeline for generation. @@ -709,7 +709,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters @@ -729,11 +730,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps if timesteps is not None: @@ -752,39 +753,33 @@ def __call__( height, width, prompt_embeds.dtype, - generator, ) + generator, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Prepare upscaled image and noise level image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor( - upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level) + noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) if do_classifier_free_guidance: noise_level = paddle.concat([noise_level] * 2) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): model_input = paddle.concat( - [ - intermediate_images, - upscaled.cast(intermediate_images.dtype) - ], - axis=1, ) - - model_input = (paddle.concat([model_input] * 2) - if do_classifier_free_guidance else model_input) + [intermediate_images, upscaled.cast(intermediate_images.dtype)], + axis=1, + ) + + model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -794,7 +789,8 @@ def __call__( encoder_hidden_states=prompt_embeds, class_labels=noise_level, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: @@ -802,21 +798,19 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ model_input.shape[1] // 2, - noise_pred_uncond.shape[1] - model_input.shape[1] // - 2, + noise_pred_uncond.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1] // 2, - noise_pred_text.shape[1] - model_input.shape[1] // - 2, + noise_pred_text.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( @@ -824,12 +818,11 @@ def __call__( t, intermediate_images, **extra_step_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -842,16 +835,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 11. 
Convert to PIL image = self.numpy_to_pil(image) # 12. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -862,8 +853,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -871,4 +861,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py index 8fcd1ab740f28..e4f32ce9b69a9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py @@ -15,8 +15,11 @@ import numpy as np import paddle import paddle.nn as nn -from paddlenlp.transformers import (CLIPConfig, CLIPVisionModelWithProjection, - PretrainedModel) +from paddlenlp.transformers import ( + CLIPConfig, + CLIPVisionModelWithProjection, + PretrainedModel, +) from ...utils import logging @@ -46,7 +49,8 @@ def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): if any(nsfw_detected): logger.warning( "Potential NSFW content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed.") + " Try again with a different prompt and/or seed." + ) for idx, nsfw_detected_ in enumerate(nsfw_detected): if nsfw_detected_: @@ -60,7 +64,8 @@ def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): if any(watermark_detected): logger.warning( "Potential watermarked content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed.") + " Try again with a different prompt and/or seed." 
+ ) for idx, watermark_detected_ in enumerate(watermark_detected): if watermark_detected_: diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py index 998eb357d858a..ad156baf5b46f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py @@ -29,8 +29,8 @@ def __init__(self): self.register_buffer( "watermark_image", - paddle.zeros( - (62, 62, 4), dtype=paddle.get_default_dtype()), ) + paddle.zeros((62, 62, 4), dtype=paddle.get_default_dtype()), + ) self.watermark_image_as_pil = None def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): @@ -45,9 +45,8 @@ def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w) S1, S2 = 1024**2, img_w * img_h - K = (S2 / S1)**0.5 - wm_size, wm_x, wm_y = int(K * - 62), img_w - int(14 * K), img_h - int(14 * K) + K = (S2 / S1) ** 0.5 + wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K) if self.watermark_image_as_pil is None: watermark_image = self.watermark_image.cpu().numpy().astype("uint8") @@ -55,12 +54,14 @@ def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): self.watermark_image_as_pil = watermark_image wm_img = self.watermark_image_as_pil.resize( - (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None) + (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None + ) for pil_img in images: pil_img.paste( wm_img, box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y), - mask=wm_img.split()[-1], ) + mask=wm_img.split()[-1], + ) return images diff --git a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py index faf4f122a123f..ff5d4541cde55 100644 --- a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py +++ b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py @@ -44,14 +44,14 @@ class DiTPipeline(DiffusionPipeline): """ def __init__( - self, - transformer: Transformer2DModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - id2label: Optional[Dict[int, str]]=None, ): + self, + transformer: Transformer2DModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + id2label: Optional[Dict[int, str]] = None, + ): super().__init__() - self.register_modules( - transformer=transformer, vae=vae, scheduler=scheduler) + self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler) # create a imagenet -> id dictionary for easier use self.labels = {} @@ -88,14 +88,14 @@ def get_label_ids(self, label: Union[str, List[str]]) -> List[int]: @paddle.no_grad() def __call__( - self, - class_labels: List[int], - guidance_scale: float=4.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + class_labels: List[int], + guidance_scale: float = 4.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Function invoked when calling the pipeline for generation. 
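The hunk below only reformats the DiT pipeline's classifier-free-guidance path: when guidance_scale > 1 the latent batch is doubled, the second half of the class-label batch is set to the null class id 1000, and the conditional and unconditional noise predictions are blended. A minimal sketch of that blend step, with dummy paddle tensors and made-up shapes (the helper is illustrative only and not part of this patch):

import paddle

def blend_noise_predictions(cond_eps, uncond_eps, guidance_scale):
    # eps = uncond + s * (cond - uncond); s == 1 recovers the purely
    # conditional prediction, larger s pushes samples toward the class label.
    return uncond_eps + guidance_scale * (cond_eps - uncond_eps)

# hypothetical usage with dummy shapes
cond_eps = paddle.randn([2, 4, 32, 32])
uncond_eps = paddle.randn([2, 4, 32, 32])
guided = blend_noise_predictions(cond_eps, uncond_eps, guidance_scale=4.0)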
@@ -123,24 +123,22 @@ def __call__( latents = randn_tensor( shape=(batch_size, latent_channels, latent_size, latent_size), generator=generator, - dtype=self.transformer.dtype, ) - latent_model_input = (paddle.concat([latents] * 2) - if guidance_scale > 1 else latents) + dtype=self.transformer.dtype, + ) + latent_model_input = paddle.concat([latents] * 2) if guidance_scale > 1 else latents class_labels = paddle.to_tensor(class_labels).flatten() class_null = paddle.to_tensor([1000] * batch_size) - class_labels_input = (paddle.concat([class_labels, class_null], 0) - if guidance_scale > 1 else class_labels) + class_labels_input = paddle.concat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels # set step values self.scheduler.set_timesteps(num_inference_steps) for t in self.progress_bar(self.scheduler.timesteps): if guidance_scale > 1: - half = latent_model_input[:len(latent_model_input) // 2] + half = latent_model_input[: len(latent_model_input) // 2] latent_model_input = paddle.concat([half, half], axis=0) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) timesteps = t if not paddle.is_tensor(timesteps): @@ -154,22 +152,25 @@ def __call__( elif len(timesteps.shape) == 0: timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([latent_model_input.shape[0], ]) + timesteps = timesteps.expand( + [ + latent_model_input.shape[0], + ] + ) # predict noise model_output noise_pred = self.transformer( - latent_model_input, - timestep=timesteps, - class_labels=class_labels_input).sample + latent_model_input, timestep=timesteps, class_labels=class_labels_input + ).sample # perform guidance if guidance_scale > 1: eps, rest = ( noise_pred[:, :latent_channels], - noise_pred[:, latent_channels:], ) + noise_pred[:, latent_channels:], + ) bs = eps.shape[0] # TODO torch.split vs paddle.split - cond_eps, uncond_eps = paddle.split( - eps, [bs // 2, bs - bs // 2], axis=0) + cond_eps, uncond_eps = paddle.split(eps, [bs // 2, bs - bs // 2], axis=0) half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps) eps = paddle.concat([half_eps, half_eps], axis=0) @@ -182,13 +183,13 @@ def __call__( model_output, _ = paddle.split( noise_pred, [latent_channels, noise_pred.shape[1] - latent_channels], - axis=1, ) + axis=1, + ) else: model_output = noise_pred # compute previous image: x_t -> x_t-1 - latent_model_input = self.scheduler.step( - model_output, t, latent_model_input).prev_sample + latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample if guidance_scale > 1: latents, _ = latent_model_input.chunk(2, axis=0) @@ -207,6 +208,6 @@ def __call__( samples = self.numpy_to_pil(samples) if not return_dict: - return (samples, ) + return (samples,) return ImagePipelineOutput(images=samples) diff --git a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py index 8f75881eec2ef..9b672f9c0f8a5 100644 --- a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py @@ -26,18 +26,38 @@ from ..image_processor import VaeImageProcessor from ..schedulers import ( - DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, 
KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - PreconfigEulerAncestralDiscreteScheduler, PreconfigLMSDiscreteScheduler, - UniPCMultistepScheduler) + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + PreconfigEulerAncestralDiscreteScheduler, + PreconfigLMSDiscreteScheduler, + UniPCMultistepScheduler, +) from ..utils import ( - DIFFUSERS_CACHE, FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME, - FROM_HF_HUB, HF_HUB_OFFLINE, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, _add_variant, _get_model_file, is_fastdeploy_available, - is_paddle_available, logging, randn_tensor) + DIFFUSERS_CACHE, + FASTDEPLOY_MODEL_NAME, + FASTDEPLOY_WEIGHTS_NAME, + FROM_HF_HUB, + HF_HUB_OFFLINE, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + _add_variant, + _get_model_file, + is_fastdeploy_available, + is_paddle_available, + logging, + randn_tensor, +) from ..version import VERSION as __version__ __all__ = ["FastDeployRuntimeModel", "FastDeployDiffusionPipelineMixin"] @@ -54,9 +74,7 @@ def fdtensor2pdtensor(fdtensor: "fd.C.FDTensor"): pdtensor = paddle.utils.dlpack.from_dlpack(dltensor) return pdtensor - def pdtensor2fdtensor(pdtensor: paddle.Tensor, - name: str="", - share_with_raw_ptr=False): + def pdtensor2fdtensor(pdtensor: paddle.Tensor, name: str = "", share_with_raw_ptr=False): if not share_with_raw_ptr: dltensor = paddle.utils.dlpack.to_dlpack(pdtensor) return fd.C.FDTensor.from_dlpack(name, dltensor) @@ -67,7 +85,8 @@ def pdtensor2fdtensor(pdtensor: paddle.Tensor, pdtensor.shape, pdtensor.dtype.name, str(pdtensor.place), - int(pdtensor.place.gpu_device_id()), ) + int(pdtensor.place.gpu_device_id()), + ) logger = logging.get_logger(__name__) @@ -88,7 +107,8 @@ def pdtensor2fdtensor(pdtensor: paddle.Tensor, [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -207,32 +227,20 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. 
""" max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -240,23 +248,21 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] # we must to tensor first! - return paddle.to_tensor( - tokens, dtype="int64"), paddle.to_tensor( - weights, dtype="float32") + return paddle.to_tensor(tokens, dtype="int64"), paddle.to_tensor(weights, dtype="float32") def get_unweighted_text_embeddings( - pipe, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, - infer_op=None, ): + pipe, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, + infer_op=None, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. @@ -267,8 +273,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -282,7 +287,8 @@ def get_unweighted_text_embeddings( text_embedding = pipe.text_encoder( input_ids=text_input_chunk, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] if no_boseos_middle: if i == 0: # discard the ending token @@ -305,20 +311,22 @@ def get_unweighted_text_embeddings( text_embeddings = pipe.text_encoder( input_ids=text_input, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return text_embeddings def get_weighted_text_embeddings( - pipe, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - infer_op=None, - **kwargs, ): + pipe, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + infer_op=None, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. 
For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -342,24 +350,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. """ - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -367,33 +370,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( @@ -404,7 +400,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( uncond_tokens, @@ -414,35 +411,34 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + 
chunk_length=pipe.tokenizer.model_max_length, + ) # get the embeddings text_embeddings = get_unweighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, - infer_op=infer_op, ) + infer_op=infer_op, + ) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, - infer_op=infer_op, ) + infer_op=infer_op, + ) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= ( - (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1) - .unsqueeze(-1)) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= ( - (previous_mean / uncond_embeddings.mean(axis=[-2, -1])) - .unsqueeze(-1).unsqueeze(-1)) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: return text_embeddings, uncond_embeddings @@ -459,8 +455,7 @@ def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs): continue module = getattr(self, name) if isinstance(module, FastDeployRuntimeModel): - infer_op = (infer_op_dict.get(name, "zero_copy_infer") - if module.is_spport_zero_copy() else "raw") + infer_op = infer_op_dict.get(name, "zero_copy_infer") if module.is_spport_zero_copy() else "raw" # if parse_prompt_type in ["lpw", "webui"] and name in ["text_encoder"]: # if infer_op != "raw": # logger.warning( @@ -470,19 +465,16 @@ def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs): new_infer_op_dict[name] = infer_op return new_infer_op_dict - def post_init(self, - vae_scaling_factor=0.18215, - vae_scale_factor=8, - dtype="float32"): + def post_init(self, vae_scaling_factor=0.18215, vae_scale_factor=8, dtype="float32"): self.vae_scaling_factor = vae_scaling_factor self.vae_scale_factor = vae_scale_factor - self.image_processor = VaeImageProcessor( - vae_scale_factor=vae_scale_factor, do_convert_rgb=True) + self.image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, - do_normalize=False, ) + do_normalize=False, + ) self.dtype = dtype self.supported_scheduler = [ "pndm", @@ -533,53 +525,44 @@ def text_encoder_hidden_states_dim(self): def change_scheduler(self, scheduler_type="ddim", inplace=True): scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "preconfig-lms": - scheduler = PreconfigLMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = PreconfigLMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - 
scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "preconfig-euler-ancestral": - scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, - ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError( f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" 
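The change_scheduler branches reformatted above all follow one pattern: pick a scheduler class by its string name and rebuild it with from_config on the pipeline's original scheduler config. A table-driven equivalent is sketched here for illustration only; the _SCHEDULERS mapping and build_scheduler helper are hypothetical, while the class names and per-scheduler kwargs are copied from the branches above (only a few entries shown, not the full list):

from ppdiffusers.schedulers import (
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
)

# name -> (scheduler class, extra kwargs used by the corresponding branch)
_SCHEDULERS = {
    "pndm": (PNDMScheduler, {"skip_prk_steps": True}),
    "lms": (LMSDiscreteScheduler, {}),
    "euler": (EulerDiscreteScheduler, {}),
    "euler-ancestral": (EulerAncestralDiscreteScheduler, {}),
    "ddim": (DDIMScheduler, {"steps_offset": 1, "clip_sample": False, "set_alpha_to_one": False}),
}

def build_scheduler(name, original_config):
    try:
        cls, extra_kwargs = _SCHEDULERS[name.lower()]
    except KeyError:
        raise ValueError(f"Scheduler of type {name} doesn't exist!")
    return cls.from_config(original_config, **extra_kwargs)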
@@ -590,16 +573,13 @@ def change_scheduler(self, scheduler_type="ddim", inplace=True): def get_timesteps(self, num_inference_steps, strength=1.0): if strength >= 1: - return self.scheduler.timesteps.cast( - self.dtype), num_inference_steps + return self.scheduler.timesteps.cast(self.dtype), num_inference_steps # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[ - t_start * self.scheduler.order:].cast(self.dtype) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].cast(self.dtype) if hasattr(self.scheduler, "step_index_offset"): self.scheduler.step_index_offset = t_start * self.scheduler.order @@ -615,24 +595,24 @@ def get_timesteps(self, num_inference_steps, strength=1.0): return timesteps, num_inference_steps def prepare_controlnet_cond( - self, - controlnet_cond, - controlnet_conditioning_scale, - width, - height, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance=False, ): + self, + controlnet_cond, + controlnet_conditioning_scale, + width, + height, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance=False, + ): control_image = self.control_image_processor.preprocess( controlnet_cond, height=height, - width=width, ) + width=width, + ) if isinstance(controlnet_conditioning_scale, (float, int)): - controlnet_conditioning_scale = paddle.to_tensor( - [controlnet_conditioning_scale] * 13, dtype=self.dtype) + controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=self.dtype) elif isinstance(controlnet_conditioning_scale, (list, tuple)): - controlnet_conditioning_scale = paddle.to_tensor( - controlnet_conditioning_scale, dtype=self.dtype) + controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=self.dtype) else: raise ValueError( f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" @@ -650,40 +630,40 @@ def prepare_controlnet_cond( return control_image, controlnet_conditioning_scale def check_inputs( - self, - prompt, - height=512, - width=512, - callback_steps=1, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - strength=1.0, ): + self, + prompt, + height=512, + width=512, + callback_steps=1, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + strength=1.0, + ): if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError( f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." 
+ ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -696,24 +676,25 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") def prepare_latents( - self, - batch_size, - height, - width, - generator, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, - infer_op=None, ): + self, + batch_size, + height, + width, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + infer_op=None, + ): shape = [ batch_size, self.vae_decoder_num_latent_channels, @@ -739,46 +720,44 @@ def prepare_latents( if latents is None: noise = randn_tensor(shape, generator=generator, dtype=self.dtype) # if strength is 1. 
then initialise the latents to noise, else initial to image + noise - latents = (noise if is_strength_max else - self.scheduler.add_noise(image_latents, noise, timestep)) + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = (latents * self.scheduler.init_noise_sigma - if is_strength_max else latents) + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: noise = latents if str(noise.dtype).replace("paddle.", "") != self.dtype: noise = noise.cast(self.dtype) latents = noise * self.scheduler.init_noise_sigma - outputs = (latents, ) + outputs = (latents,) if return_noise: - outputs += (noise, ) + outputs += (noise,) if return_image_latents: - outputs += (image_latents, ) + outputs += (image_latents,) if len(outputs) == 1: outputs = latents return outputs def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - do_classifier_free_guidance, - return_masked_image_latents=True, - infer_op=None, ): + self, + mask, + masked_image, + batch_size, + height, + width, + do_classifier_free_guidance, + return_masked_image_latents=True, + infer_op=None, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision mask = paddle.nn.functional.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) mask = mask.cast(dtype=self.dtype) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -791,8 +770,7 @@ def prepare_mask_latents( ) mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask if not return_masked_image_latents: return mask @@ -805,20 +783,18 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype=self.dtype) return mask, masked_image_latents def is_scheduler_support_step_index(self): - kwargs_keys = set( - inspect.signature(self.scheduler.step).parameters.keys()) + kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) return "kwargs" in kwargs_keys or "step_index" in kwargs_keys def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): @@ -832,14 +808,12 @@ def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): image_latents = self.vae_encoder( sample=image, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return self.vae_scaling_factor * image_latents - def _decode_vae_latents(self, - latents: paddle.Tensor, - infer_op=None, - **kwargs): + def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): latents_shape = latents.shape output_shape = [ latents_shape[0], @@ -850,22 +824,24 @@ def _decode_vae_latents(self, images_vae = self.vae_decoder( latent_sample=latents, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return images_vae def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - infer_op=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): if parse_prompt_type == "lpw": return self._encode_prompt_lpw( prompt, @@ -876,7 +852,8 @@ def _encode_prompt( negative_prompt_embeds=negative_prompt_embeds, max_embeddings_multiples=max_embeddings_multiples, infer_op="raw", # NOTE: we can't use zero copy! 
- **kwargs, ) + **kwargs, + ) elif parse_prompt_type == "raw": return self._encode_prompt_raw( prompt, @@ -885,22 +862,23 @@ def _encode_prompt( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - infer_op=infer_op, ) + infer_op=infer_op, + ) elif parse_prompt_type == "webui": - raise NotImplementedError( - "`parse_prompt_type=webui` is not implemented yet.") + raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") def _encode_prompt_lpw( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - infer_op=None, - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -930,18 +908,19 @@ def _encode_prompt_lpw( if do_classifier_free_guidance: if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -951,37 +930,35 @@ def _encode_prompt_lpw( uncond_prompt=uncond_tokens, max_embeddings_multiples=max_embeddings_multiples, infer_op=infer_op, - **kwargs, ) + **kwargs, + ) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def _encode_prompt_raw( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - infer_op=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -1018,21 +995,22 @@ def _encode_prompt_raw( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", - return_tensors="pd").input_ids # check + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids # check - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) prompt_embeds = self.text_encoder( input_ids=text_input_ids, @@ -1041,13 +1019,13 @@ def _encode_prompt_raw( batch_size, self.tokenizer.model_max_length, self.text_encoder_hidden_states_dim, - ], )[0] + ], + )[0] bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -1056,14 +1034,16 @@ def _encode_prompt_raw( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -1073,7 +1053,8 @@ def _encode_prompt_raw( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) negative_prompt_embeds = self.text_encoder( input_ids=uncond_input.input_ids, infer_op=infer_op, @@ -1081,21 +1062,19 @@ def _encode_prompt_raw( batch_size, max_length, self.text_encoder_hidden_states_dim, - ], )[0] + ], + )[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -1104,17 +1083,15 @@ def run_safety_checker(self, image): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="np") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="np") image, has_nsfw_concept = self.safety_checker( images=image.numpy(), clip_input=safety_checker_input.pixel_values.astype(self.dtype), - infer_op="raw", ) + infer_op="raw", + ) image = paddle.to_tensor(image, dtype=self.dtype) return image, has_nsfw_concept @@ -1124,15 +1101,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -1140,9 +1115,7 @@ def prepare_extra_step_kwargs(self, generator, eta): class FastDeployRuntimeModel: def __init__(self, model=None, **kwargs): - logger.info( - "`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future." 
- ) + logger.info("`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future.") self.model = model self.model_save_dir = kwargs.get("model_save_dir", None) self.model_format = kwargs.get("model_format", None) @@ -1171,11 +1144,12 @@ def is_spport_zero_copy(self): return False def zero_copy_infer( - self, - prebinded_inputs: dict, - prebinded_outputs: dict, - share_with_raw_ptr=True, - **kwargs, ): + self, + prebinded_inputs: dict, + prebinded_outputs: dict, + share_with_raw_ptr=True, + **kwargs, + ): """ Execute inference without copying data from cpu to gpu. @@ -1186,17 +1160,11 @@ def zero_copy_infer( List of output tensor. """ for inputs_name, inputs_tensor in prebinded_inputs.items(): - input_fdtensor = pdtensor2fdtensor( - inputs_tensor, - inputs_name, - share_with_raw_ptr=share_with_raw_ptr) + input_fdtensor = pdtensor2fdtensor(inputs_tensor, inputs_name, share_with_raw_ptr=share_with_raw_ptr) self.model.bind_input_tensor(inputs_name, input_fdtensor) for outputs_name, outputs_tensor in prebinded_outputs.items(): - output_fdtensor = pdtensor2fdtensor( - outputs_tensor, - outputs_name, - share_with_raw_ptr=share_with_raw_ptr) + output_fdtensor = pdtensor2fdtensor(outputs_tensor, outputs_name, share_with_raw_ptr=share_with_raw_ptr) self.model.bind_output_tensor(outputs_name, output_fdtensor) self.model.zero_copy_infer() @@ -1222,25 +1190,27 @@ def __call__(self, **kwargs): self.zero_copy_infer( prebinded_inputs=inputs, prebinded_outputs={self.model.get_output_info(0).name: output}, - share_with_raw_ptr=share_with_raw_ptr, ) - return [output, ] + share_with_raw_ptr=share_with_raw_ptr, + ) + return [ + output, + ] elif infer_op == "raw": inputs = {} for k, v in kwargs.items(): if paddle.is_tensor(v): v = v.numpy() inputs[k] = np.array(v) - return [ - paddle.to_tensor(output) for output in self.model.infer(inputs) - ] + return [paddle.to_tensor(output) for output in self.model.infer(inputs)] else: raise ValueError("Unknown infer_op {}".format(infer_op)) @staticmethod def load_model( - model_path: Union[str, Path], - params_path: Union[str, Path]=None, - runtime_options: Optional["fd.RuntimeOption"]=None, ): + model_path: Union[str, Path], + params_path: Union[str, Path] = None, + runtime_options: Optional["fd.RuntimeOption"] = None, + ): """ Loads an FastDeploy Inference Model with fastdeploy.RuntimeOption @@ -1255,9 +1225,7 @@ def load_model( """ option = runtime_options if option is None or not isinstance(runtime_options, fd.RuntimeOption): - logger.info( - "No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend." - ) + logger.info("No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend.") option = fd.RuntimeOption() option.use_paddle_backend() option.use_cpu() @@ -1275,11 +1243,12 @@ def load_model( return fd.Runtime(option) def _save_pretrained( - self, - save_directory: Union[str, Path], - model_file_name: Optional[str]=None, - params_file_name: Optional[str]=None, - **kwargs, ): + self, + save_directory: Union[str, Path], + model_file_name: Optional[str] = None, + params_file_name: Optional[str] = None, + **kwargs, + ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the [`~FastDeployRuntimeModel.from_pretrained`] class method. It will always save the @@ -1296,11 +1265,14 @@ def _save_pretrained( model with a different name. 
""" is_onnx_model = self.model_format == ModelFormat.ONNX - model_file_name = (model_file_name if model_file_name is not None else - FASTDEPLOY_MODEL_NAME - if not is_onnx_model else ONNX_WEIGHTS_NAME) - params_file_name = (params_file_name if params_file_name is not None - else FASTDEPLOY_WEIGHTS_NAME) + model_file_name = ( + model_file_name + if model_file_name is not None + else FASTDEPLOY_MODEL_NAME + if not is_onnx_model + else ONNX_WEIGHTS_NAME + ) + params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME src_model_path = self.model_save_dir.joinpath(self.latest_model_name) dst_model_path = Path(save_directory).joinpath(model_file_name) @@ -1312,19 +1284,16 @@ def _save_pretrained( if is_onnx_model: # copy external weights (for models >2GB) - src_model_path = self.model_save_dir.joinpath( - ONNX_EXTERNAL_WEIGHTS_NAME) + src_model_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) if src_model_path.exists(): - dst_model_path = Path(save_directory).joinpath( - ONNX_EXTERNAL_WEIGHTS_NAME) + dst_model_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) try: shutil.copyfile(src_model_path, dst_model_path) except shutil.SameFileError: pass if not is_onnx_model: - src_params_path = self.model_save_dir.joinpath( - self.latest_params_name) + src_params_path = self.model_save_dir.joinpath(self.latest_params_name) dst_params_path = Path(save_directory).joinpath(params_file_name) try: shutil.copyfile(src_params_path, dst_params_path) @@ -1332,9 +1301,10 @@ def _save_pretrained( pass def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - **kwargs, ): + self, + save_directory: Union[str, os.PathLike], + **kwargs, + ): """ Save a model to a directory, so that it can be re-loaded using the [`~FastDeployRuntimeModel.from_pretrained`] class method.: @@ -1344,9 +1314,7 @@ def save_pretrained( Directory to which to save. Will be created if it doesn't exist. 
""" if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) @@ -1356,23 +1324,24 @@ def save_pretrained( @classmethod def _from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, Path], - model_file_name: Optional[str]=None, - params_file_name: Optional[str]=None, - use_auth_token: Optional[Union[bool, str, None]]=None, - revision: Optional[str]=None, - subfolder: Optional[str]=None, - force_download: bool=False, - cache_dir: Optional[str]=None, - runtime_options: Optional["fd.RuntimeOption"]=None, - from_hf_hub: Optional[bool]=False, - proxies: Optional[Dict]=None, - resume_download: bool=False, - local_files_only: bool=False, - user_agent: Union[Dict, str, None]=None, - is_onnx_model: bool=False, - **kwargs, ): + cls, + pretrained_model_name_or_path: Union[str, Path], + model_file_name: Optional[str] = None, + params_file_name: Optional[str] = None, + use_auth_token: Optional[Union[bool, str, None]] = None, + revision: Optional[str] = None, + subfolder: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + runtime_options: Optional["fd.RuntimeOption"] = None, + from_hf_hub: Optional[bool] = False, + proxies: Optional[Dict] = None, + resume_download: bool = False, + local_files_only: bool = False, + user_agent: Union[Dict, str, None] = None, + is_onnx_model: bool = False, + **kwargs, + ): """ Load a model from a directory or the HF Hub. @@ -1404,24 +1373,25 @@ def _from_pretrained( kwargs will be passed to the model during initialization """ - model_file_name = (model_file_name if model_file_name is not None else - FASTDEPLOY_MODEL_NAME - if not is_onnx_model else ONNX_WEIGHTS_NAME) - params_file_name = (params_file_name if params_file_name is not None - else FASTDEPLOY_WEIGHTS_NAME) + model_file_name = ( + model_file_name + if model_file_name is not None + else FASTDEPLOY_MODEL_NAME + if not is_onnx_model + else ONNX_WEIGHTS_NAME + ) + params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME kwargs["model_format"] = "ONNX" if is_onnx_model else "PADDLE" # load model from local directory if os.path.isdir(pretrained_model_name_or_path): - model_path = os.path.join(pretrained_model_name_or_path, - model_file_name) - params_path = ( - None if is_onnx_model else - os.path.join(pretrained_model_name_or_path, params_file_name)) + model_path = os.path.join(pretrained_model_name_or_path, model_file_name) + params_path = None if is_onnx_model else os.path.join(pretrained_model_name_or_path, params_file_name) model = FastDeployRuntimeModel.load_model( model_path, params_path, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) kwargs["model_save_dir"] = Path(pretrained_model_name_or_path) # load model from hub or paddle bos else: @@ -1437,7 +1407,8 @@ def _from_pretrained( resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, - user_agent=user_agent, ) + user_agent=user_agent, + ) if is_onnx_model: params_cache_path = None kwargs["latest_params_name"] = None @@ -1454,7 +1425,8 @@ def _from_pretrained( resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, - user_agent=user_agent, ) + user_agent=user_agent, + ) kwargs["latest_params_name"] = Path(params_cache_path).name kwargs["model_save_dir"] = 
Path(model_cache_path).parent kwargs["latest_model_name"] = Path(model_cache_path).name @@ -1462,21 +1434,24 @@ def _from_pretrained( model = FastDeployRuntimeModel.load_model( model_cache_path, params_cache_path, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) return cls(model=model, **kwargs) @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, Path], - model_file_name: Optional[str]=None, - params_file_name: Optional[str]=None, - runtime_options: Optional["fd.RuntimeOption"]=None, - is_onnx_model: bool=False, - **kwargs, ): + cls, + pretrained_model_name_or_path: Union[str, Path], + model_file_name: Optional[str] = None, + params_file_name: Optional[str] = None, + runtime_options: Optional["fd.RuntimeOption"] = None, + is_onnx_model: bool = False, + **kwargs, + ): from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) @@ -1508,4 +1483,5 @@ def from_pretrained( local_files_only=local_files_only, user_agent=user_agent, is_onnx_model=is_onnx_model, - **kwargs, ) + **kwargs, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py index 0ebba5a459d49..dd119ef22d12e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py @@ -15,9 +15,11 @@ # flake8: noqa from ...utils import is_paddlenlp_available -from .pipeline_latent_diffusion_superresolution import \ - LDMSuperResolutionPipeline +from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline if is_paddlenlp_available(): - from .pipeline_latent_diffusion import (LDMBertConfig, LDMBertModel, - LDMTextToImagePipeline) + from .pipeline_latent_diffusion import ( + LDMBertConfig, + LDMBertModel, + LDMTextToImagePipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index f0d4f43308d80..e82dda6fe1de3 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -19,16 +19,20 @@ import paddle import paddle.nn as nn -from paddlenlp.transformers import (PretrainedConfig, PretrainedModel, - PretrainedTokenizer, register_base_model) -from paddlenlp.transformers.model_outputs import \ - BaseModelOutputWithPoolingAndCrossAttentions +from paddlenlp.transformers import ( + PretrainedConfig, + PretrainedModel, + PretrainedTokenizer, + register_base_model, +) +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, +) from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ...utils.initializer_utils import normal_, 
zeros_ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -69,34 +73,29 @@ class LDMTextToImagePipeline(DiffusionPipeline): """ def __init__( - self, - vqvae: Union[VQModel, AutoencoderKL], - bert: PretrainedModel, - tokenizer: PretrainedTokenizer, - unet: Union[UNet2DModel, UNet2DConditionModel], - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], ): + self, + vqvae: Union[VQModel, AutoencoderKL], + bert: PretrainedModel, + tokenizer: PretrainedTokenizer, + unet: Union[UNet2DModel, UNet2DConditionModel], + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -104,35 +103,25 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) if tokenizer.model_max_length > 77: tokenizer.model_max_length = 77 - self.register_modules( - vqvae=vqvae, - bert=bert, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler) - self.vae_scale_factor = ( - 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1) - ) + self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + self.vae_scale_factor = 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
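The `_encode_prompt` hunks around this point keep the usual classifier-free-guidance layout: conditional and unconditional text embeddings are duplicated once per generated image and stacked into a single batch, unconditional half first. A minimal sketch of that tensor bookkeeping with dummy tensors (the shapes and the helper name `expand_per_prompt` are illustrative only, not part of the pipeline):

import paddle

# Dummy stand-ins for text-encoder outputs: 2 prompts, 77 tokens, 1280-dim hidden states.
prompt_embeds = paddle.randn([2, 77, 1280])
negative_prompt_embeds = paddle.randn([2, 77, 1280])
num_images_per_prompt = 3

def expand_per_prompt(embeds, n):
    # Mirrors the tile/reshape trick used in the pipeline to repeat embeddings per generated image.
    bs, seq_len, dim = embeds.shape
    return embeds.tile([1, n, 1]).reshape([bs * n, seq_len, dim])

prompt_embeds = expand_per_prompt(prompt_embeds, num_images_per_prompt)
negative_prompt_embeds = expand_per_prompt(negative_prompt_embeds, num_images_per_prompt)

# Unconditional embeddings come first, so one UNet forward pass serves both guidance branches.
cfg_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
print(cfg_embeds.shape)  # [12, 77, 1280]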
@@ -168,21 +157,25 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because LDMBert can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - prompt_embeds = self.bert(text_input_ids, ) + prompt_embeds = self.bert( + text_input_ids, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.bert.dtype) @@ -190,8 +183,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -201,14 +193,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -218,28 +212,27 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - negative_prompt_embeds = self.bert(uncond_input.input_ids, ) + negative_prompt_embeds = self.bert( + uncond_input.input_ids, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.bert.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.bert.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -257,53 +250,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -316,17 +305,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -349,26 +340,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=256, - width: Optional[int]=256, - num_inference_steps: int=50, - guidance_scale: float=1.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ) -> Union[ - Tuple, ImagePipelineOutput]: + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 256, + width: Optional[int] = 256, + num_inference_steps: int = 50, + guidance_scale: float = 1.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> Union[Tuple, ImagePipelineOutput]: r""" Function invoked when calling the pipeline for generation. @@ -443,7 +433,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -465,7 +456,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -480,43 +472,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -533,7 +520,7 @@ def __call__( image = self.decode_latents(latents) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) @@ -554,25 +541,26 @@ class LDMBertConfig(PretrainedConfig): } def __init__( - self, - vocab_size=30522, - max_position_embeddings=77, - encoder_layers=32, - encoder_ffn_dim=5120, - encoder_attention_heads=8, - head_dim=64, - encoder_layerdrop=0.0, - activation_function="gelu", - d_model=1280, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - use_cache=True, - pad_token_id=0, - **kwargs, ): + self, + vocab_size=30522, + max_position_embeddings=77, + encoder_layers=32, + encoder_ffn_dim=5120, + encoder_attention_heads=8, + head_dim=64, + encoder_layerdrop=0.0, + activation_function="gelu", + d_model=1280, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + pad_token_id=0, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -590,9 +578,7 @@ def __init__( self.classifier_dropout = classifier_dropout self.use_cache = use_cache self.num_hidden_layers = encoder_layers - self.scale_embedding = ( - scale_embedding # scale factor will be sqrt(d_model) if True - ) + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -603,9 +589,7 @@ class 
LDMBertPretrainedModel(PretrainedModel): base_model_prefix = "ldmbert" config_class = LDMBertConfig _supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [ - r"encoder\.version", r"decoder\.version" - ] + _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"] def init_weights(self): """ @@ -626,9 +610,7 @@ def gradient_checkpointing_enable(self): activations". """ if not self.supports_gradient_checkpointing: - raise ValueError( - f"{self.__class__.__name__} does not support gradient checkpointing." - ) + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") self.apply(partial(self._set_gradient_checkpointing, value=True)) def gradient_checkpointing_disable(self): @@ -656,15 +638,15 @@ def _init_weights(self, module): class LDMBertEmbeddings(nn.Layer): def __init__( - self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.0, - max_position_embeddings=512, ): + self, + vocab_size, + hidden_size=768, + hidden_dropout_prob=0.0, + max_position_embeddings=512, + ): super().__init__() self.word_embeddings = nn.Embedding(vocab_size, hidden_size) - self.position_embeddings = nn.Embedding(max_position_embeddings, - hidden_size) + self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): @@ -684,18 +666,19 @@ def forward(self, input_ids, position_ids=None): class TransformerEncoderLayer(nn.TransformerEncoderLayer): def __init__( - self, - d_model, - nhead, - dim_feedforward, - dropout=0.1, - activation="gelu", - attn_dropout=None, - act_dropout=None, - normalize_before=False, - weight_attr=None, - bias_attr=None, - head_dim=64, ): + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="gelu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + head_dim=64, + ): super().__init__( d_model, nhead, @@ -706,7 +689,8 @@ def __init__( act_dropout, normalize_before, weight_attr, - bias_attr, ) + bias_attr, + ) # update self attn self.self_attn = LDMBertAttention( d_model, @@ -714,7 +698,8 @@ def __init__( nhead, dropout=attn_dropout, weight_attr=weight_attr, - bias_attr=False, ) + bias_attr=False, + ) @register_base_model @@ -727,7 +712,8 @@ def __init__(self, config: LDMBertConfig): config.vocab_size, config.d_model, config.dropout, - config.max_position_embeddings, ) + config.max_position_embeddings, + ) encoder_layer = TransformerEncoderLayer( config.d_model, config.encoder_attention_heads, @@ -737,10 +723,10 @@ def __init__(self, config: LDMBertConfig): attn_dropout=config.attention_dropout, act_dropout=config.activation_dropout, normalize_before=True, - head_dim=config.head_dim, ) + head_dim=config.head_dim, + ) - self.encoder = nn.TransformerEncoder(encoder_layer, - config.encoder_layers) + self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers) self.final_layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -751,56 +737,58 @@ def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward( - self, - input_ids, - position_ids=None, - attention_mask=None, - output_hidden_states=False, - output_attentions=False, - return_dict=False, ): + self, + input_ids, + position_ids=None, + attention_mask=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): if attention_mask is not None and attention_mask.ndim == 2: # attention_mask [batch_size, 
sequence_length] -> [batch_size, 1, 1, sequence_length] - attention_mask = attention_mask.unsqueeze( - axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids) + embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = self.final_layer_norm(encoder_outputs) - return (sequence_output, ) + return (sequence_output,) else: sequence_output = encoder_outputs[0] sequence_output = self.final_layer_norm(sequence_output) if not return_dict: - return (sequence_output, ) + encoder_outputs[1:] + return (sequence_output,) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) class LDMBertAttention(nn.MultiHeadAttention): def __init__( - self, - embed_dim, - head_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, - weight_attr=None, - bias_attr=None, ): + self, + embed_dim, + head_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None, + ): super().__init__( embed_dim, num_heads, @@ -809,15 +797,10 @@ def __init__( vdim, need_weights, weight_attr, - bias_attr, ) - assert ( - embed_dim > 0 - ), "Expected embed_dim to be greater than 0, " "but received {}".format( - embed_dim) - assert ( - num_heads > 0 - ), "Expected num_heads to be greater than 0, " "but received {}".format( - num_heads) + bias_attr, + ) + assert embed_dim > 0, "Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim) + assert num_heads > 0, "Expected num_heads to be greater than 0, " "but received {}".format(num_heads) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -830,12 +813,9 @@ def __init__( self.inner_dim = head_dim * num_heads self.scaling = self.head_dim**-0.5 - self.q_proj = nn.Linear( - embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.k_proj = nn.Linear( - self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.v_proj = nn.Linear( - self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr) + self.q_proj = nn.Linear(embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr) + self.k_proj = nn.Linear(self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr) + self.v_proj = nn.Linear(self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr) self.out_proj = nn.Linear(self.inner_dim, embed_dim, weight_attr) @@ -847,18 +827,20 @@ def __init__(self, config: LDMBertConfig): self.init_weights() def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): outputs = self.ldmbert( input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, 
output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return outputs diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 0f37d4a18387d..24475c0af099b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -21,8 +21,13 @@ from ...models import UNet2DModel, VQModel from ...schedulers import ( - DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler) + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) from ...utils import PIL_INTERPOLATION, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -55,27 +60,32 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): """ def __init__( - self, - vqvae: VQModel, - unet: UNet2DModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, ], ): + self, + vqvae: VQModel, + unet: UNet2DModel, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + ): super().__init__() self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) @paddle.no_grad() def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - batch_size: Optional[int]=1, - num_inference_steps: Optional[int]=100, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[Tuple, ImagePipelineOutput]: + self, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + batch_size: Optional[int] = 1, + num_inference_steps: Optional[int] = 100, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[Tuple, ImagePipelineOutput]: """ Args: image (`paddle.Tensor` or `PIL.Image.Image`): @@ -107,25 +117,20 @@ def __call__( elif isinstance(image, paddle.Tensor): batch_size = image.shape[0] else: - raise ValueError( - f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}") if isinstance(image, PIL.Image.Image): image = preprocess(image) height, width = image.shape[-2:] # in_channels should be 6: 3 for latents, 3 for low resolution image - latents_shape = (batch_size, self.unet.config.in_channels // 2, height, - width) + latents_shape = (batch_size, self.unet.config.in_channels // 2, height, width) latents_dtype = self.unet.dtype - latents = randn_tensor( - latents_shape, generator=generator, dtype=latents_dtype) + latents = randn_tensor(latents_shape, generator=generator, dtype=latents_dtype) image = image.cast(latents_dtype) self.scheduler.set_timesteps(num_inference_steps) timesteps_tensor = self.scheduler.timesteps # scale the initial noise by the standard deviation required by the scheduler 
latents = latents * self.scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_kwargs = {} if accepts_eta: extra_kwargs["eta"] = eta @@ -136,8 +141,7 @@ def __call__( # predict the noise residual noise_pred = self.unet(latents_input, t).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample # decode the image latents with the VQVAE image = self.vqvae.decode(latents).sample @@ -147,5 +151,5 @@ def __call__( if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index 5434c0cbb084e..11e66b2063f75 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -36,23 +36,21 @@ class LDMPipeline(DiffusionPipeline): [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents. """ - def __init__(self, - vqvae: VQModel, - unet: UNet2DModel, - scheduler: DDIMScheduler): + def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler): super().__init__() self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) @paddle.no_grad() - def __call__(self, - batch_size: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - eta: float=0.0, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs) -> Union[Tuple, ImagePipelineOutput]: + def __call__( + self, + batch_size: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + eta: float = 0.0, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs + ) -> Union[Tuple, ImagePipelineOutput]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -77,8 +75,10 @@ def __call__(self, batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -86,8 +86,7 @@ def __call__(self, self.scheduler.set_timesteps(num_inference_steps) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_kwargs = {} if accepts_eta: extra_kwargs["eta"] = eta @@ -96,13 +95,12 @@ def __call__(self, # predict the noise residual noise_prediction = self.unet(latent_model_input, t).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_prediction, t, latents, - **extra_kwargs).prev_sample + latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample 
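The `accepts_eta` / `accepts_generator` checks that recur across these pipelines are plain signature introspection, so optional arguments are only forwarded to schedulers whose `step` actually declares them. A standalone sketch of the same pattern, assuming a hypothetical `step` function as a stand-in for `scheduler.step`:

import inspect

def step(noise_pred, t, sample, eta=0.0, generator=None):
    # Hypothetical stand-in for a scheduler's step(); only its signature matters here.
    return sample

accepted = set(inspect.signature(step).parameters.keys())
extra_kwargs = {}
if "eta" in accepted:
    extra_kwargs["eta"] = 0.0
if "generator" in accepted:
    extra_kwargs["generator"] = None
print(extra_kwargs)  # {'eta': 0.0, 'generator': None}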
image = self.vqvae.decode(latents).sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py index a3967f589a49a..3d31b5e95e74f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py @@ -19,8 +19,12 @@ import numpy as np import paddle -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_paddle_available, is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) @dataclass @@ -45,7 +49,7 @@ class VideoPipelineOutput(BaseOutput): except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import * else: - from .pipeline_latent_video_diffusion_model_text2video import \ - LVDMTextToVideoPipeline - from .pipeline_latent_video_diffusion_model_uncond import \ - LVDMUncondPipeline + from .pipeline_latent_video_diffusion_model_text2video import ( + LVDMTextToVideoPipeline, + ) + from .pipeline_latent_video_diffusion_model_uncond import LVDMUncondPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py index a727ada59472b..8e339ecfee43d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py @@ -24,8 +24,7 @@ from ...configuration_utils import FrozenDict from ...models import LVDMAutoencoderKL, LVDMUNet3DModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . 
import VideoPipelineOutput from .video_save import save_results @@ -43,12 +42,12 @@ prompt="cutting in kitchen", num_frames=16, height=256, - width=256, - num_inference_steps=50, - generator=generator, + width=256, + num_inference_steps=50, + generator=generator, guidance_scale=15, - eta=1, - save_dir='.', + eta=1, + save_dir='.', save_name='ddim_lvdm_text_to_video_ucf', encoder_type='2d', scale_factor=0.18215, @@ -64,12 +63,10 @@ def split_video_to_clips(video, clip_length, drop_left=True): video_length = video.shape[2] shape = video.shape if video_length % clip_length != 0 and drop_left: - video = video[:, :, :video_length // clip_length * clip_length, :, :] - print( - f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") + video = video[:, :, : video_length // clip_length * clip_length, :, :] + print(f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") nclips = video_length // clip_length - clips = rearrange( - video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) + clips = rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) return clips @@ -104,34 +101,30 @@ class LVDMTextToVideoPipeline(DiffusionPipeline): """ def __init__( - self, - vae: LVDMAutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: LVDMUNet3DModel, - scheduler: KarrasDiffusionSchedulers, ): + self, + vae: LVDMAutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: LVDMUNet3DModel, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -139,11 +132,7 @@ def __init__( " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -153,7 +142,8 @@ def __init__( text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) # self.encoder_type = '2d' # self.scale_factor = 0.18215 @@ -166,12 +156,7 @@ def decode(self, z, **kwargs): return results @paddle.no_grad() - def overlapped_decode(self, - z, - max_z_t=None, - overlap_t=2, - predict_cids=False, - force_not_quantize=False): + def overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False): if max_z_t is None: max_z_t = z.shape[2] assert max_z_t > overlap_t @@ -190,69 +175,56 @@ def overlapped_decode(self, reses = [] for i, z_ in enumerate(zs): if i == 0: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, :max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, : max_x_t - drop_r_x, :, :] elif i == len(zs) - 1: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, drop_l_x:, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, drop_l_x:, :, :] else: - res = self.decode(z_, predict_cids, force_not_quantize).cpu( - )[:, :, drop_l_x:max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[ + :, :, drop_l_x : max_x_t - drop_r_x, :, : + ] reses.append(res) results = paddle.concat(x=reses, axis=2) return results @paddle.no_grad() - def decode_first_stage_2DAE_video(self, - z, - decode_bs=16, - return_cpu=True, - **kwargs): + def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs): b, _, t, _, _ = z.shape z = rearrange(z, "b c t h w -> (b t) c h w") if decode_bs is None: results = self.decode(z, **kwargs) else: - z = paddle.split( - x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) if return_cpu: - results = paddle.concat( - x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) + results = paddle.concat(x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) else: - results = paddle.concat( - x=[self.decode(z_, **kwargs) for z_ in z], axis=0) - results = rearrange( - results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() + results = paddle.concat(x=[self.decode(z_, **kwargs) for z_ in z], axis=0) + results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() return results @paddle.no_grad() def decode_latents( - self, - z, - decode_bs=16, - return_cpu=True, - bs=None, - decode_single_video_allframes=False, - max_z_t=None, - overlapped_length=0, - **kwargs, ): + self, + z, + decode_bs=16, + return_cpu=True, + bs=None, + decode_single_video_allframes=False, + max_z_t=None, + overlapped_length=0, + **kwargs, + ): b, _, t, _, _ = z.shape if kwargs["encoder_type"] == "2d" and z.dim() == 5: - return self.decode_first_stage_2DAE_video( - z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) + return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) if decode_single_video_allframes: z = paddle.split(x=z, 
num_or_sections=z.shape[0] // 1, axis=0) cat_dim = 0 elif max_z_t is not None: if kwargs["encoder_type"] == "3d": - z = paddle.split( - x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) + z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) cat_dim = 2 if kwargs["encoder_type"] == "2d": - z = paddle.split( - x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) cat_dim = 0 # elif self.split_clips and self.downfactor_t is not None or self.clip_length is not None and self.downfactor_t is not None and z.shape[ # 2 @@ -286,8 +258,7 @@ def paddle_to_np(self, x): if isinstance("uint8", paddle.dtype): dtype = "uint8" - elif isinstance("uint8", - str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: + elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: dtype = "uint8" elif isinstance("uint8", paddle.Tensor): dtype = "uint8".dtype @@ -299,13 +270,14 @@ def paddle_to_np(self, x): return sample def _encode_prompt( - self, - prompt, - num_videos_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -341,28 +313,30 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -370,8 +344,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_videos_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_videos_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_videos_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free 
guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -381,14 +354,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -398,36 +373,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_videos_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_videos_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_videos_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_videos_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -437,53 +409,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -496,22 +464,21 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." 
+ ) def prepare_latents( - self, - batch_size, - num_channels_latents, - num_frames, - height, - width, - dtype, - generator, - latents=None, ): - shape = [ - batch_size, num_channels_latents, num_frames, height // 8, - width // 8 - ] + self, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + generator, + latents=None, + ): + shape = [batch_size, num_channels_latents, num_frames, height // 8, width // 8] if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -528,31 +495,31 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=256, - width: Optional[int]=256, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_videos_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - save_dir=None, - save_name=None, - num_frames: Optional[int]=16, - encoder_type="2d", - scale_factor=0.18215, - shift_factor=0, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 256, + width: Optional[int] = 256, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + save_dir=None, + save_name=None, + num_frames: Optional[int] = 16, + encoder_type="2d", + scale_factor=0.18215, + shift_factor=0, + ): r""" Function invoked when calling the pipeline for generation. @@ -628,9 +595,7 @@ def __call__( """ # 0. Default height and width to unet if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -640,7 +605,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -662,7 +628,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -678,43 +645,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, timesteps=t, context=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -724,8 +686,7 @@ def __call__( "scale_factor": scale_factor, "shift_factor": shift_factor, } - sampled_videos = self.decode_latents( - latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs) + sampled_videos = self.decode_latents(latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs) all_videos.append(self.paddle_to_np(sampled_videos)) all_videos = np.concatenate(all_videos, axis=0) @@ -744,10 +705,9 @@ def __call__( videos_frames.append(video_frames) if not save_name: - save_name = f"defaul_video" + save_name = "default_video" if not save_dir: save_dir = "."
os.makedirs(save_dir, exist_ok=True) - save_results( - all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) + save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos) diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py index 5581777325761..3d64085312440 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py @@ -19,9 +19,6 @@ import numpy as np import paddle -import paddle.nn as nn -from paddlenlp.transformers import PretrainedModel, PretrainedTokenizer -from tqdm import trange from ...configuration_utils import FrozenDict from ...models import LVDMAutoencoderKL, LVDMUNet3DModel @@ -49,34 +46,29 @@ class LVDMUncondPipeline(DiffusionPipeline): """ def __init__( - self, - vae: LVDMAutoencoderKL, - unet: LVDMUNet3DModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], ): + self, + vae: LVDMAutoencoderKL, + unet: LVDMUNet3DModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) self.register_modules(vae=vae, unet=unet, scheduler=scheduler) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. 
@@ -113,8 +105,7 @@ def paddle_to_np(self, x): if isinstance("uint8", paddle.dtype): dtype = "uint8" - elif isinstance("uint8", - str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: + elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: dtype = "uint8" elif isinstance("uint8", paddle.Tensor): dtype = "uint8".dtype @@ -127,25 +118,25 @@ def paddle_to_np(self, x): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_frames: Optional[int]=16, - height: Optional[int]=256, - width: Optional[int]=256, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - eta: Optional[float]=0.0, - num_inference_steps: Optional[int]=50, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - save_dir=None, - save_name=None, - scale_factor: Optional[float]=0.33422927, - shift_factor: Optional[float]=1.4606637, - **kwargs, ) -> Union[Tuple, VideoPipelineOutput]: + self, + batch_size: int = 1, + num_frames: Optional[int] = 16, + height: Optional[int] = 256, + width: Optional[int] = 256, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + eta: Optional[float] = 0.0, + num_inference_steps: Optional[int] = 50, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + save_dir=None, + save_name=None, + scale_factor: Optional[float] = 0.33422927, + shift_factor: Optional[float] = 1.4606637, + **kwargs, + ) -> Union[Tuple, VideoPipelineOutput]: r""" Args: height (`int`, *optional*, defaults to 256): @@ -188,16 +179,15 @@ def __call__( """ if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) # get the initial random noise unless the user supplied it latents_shape = [ @@ -211,12 +201,11 @@ def __call__( if latents is None: latents = randn_tensor( latents_shape, - generator=generator, ) + generator=generator, + ) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -231,30 +220,26 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for i, t in enumerate(self.progress_bar(timesteps_tensor)): latent_model_input = latents - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) t_tensor = paddle.expand( t, - [latent_model_input.shape[0], ], ) + [ + latent_model_input.shape[0], + ], + ) # predict the noise residual noise_pred = self.unet(latent_model_input, t_tensor).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - generator=generator, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, generator=generator, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -281,10 +266,9 @@ def __call__( videos_frames.append(video_frames) if not save_name: - save_name = f"defaul_video" + save_name = "default_video" if not save_dir: save_dir = "." os.makedirs(save_dir, exist_ok=True) - save_results( - all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) + save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos) diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py index 837050f0222df..a969643113c68 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py @@ -33,12 +33,9 @@ av.logging.set_level(av.logging.ERROR) if not hasattr(av.video.frame.VideoFrame, "pict_type"): - av = ImportError( - """Your version of PyAV is too old for the necessary video operations.""" - ) + av = ImportError("""Your version of PyAV is too old for the necessary video operations.""") except ImportError: - av = ImportError( - """PyAV is not installed, and is necessary for the video operations.""") + av = ImportError("""PyAV is not installed, and is necessary for the video operations.""") def _check_av_available() -> None: @@ -47,15 +44,16 @@ def _check_av_available() -> None: def write_video( - filename: str, - video_array: paddle.Tensor, - fps: float, - video_codec: str="libx264", - options: Optional[Dict[str, Any]]=None, - audio_array: Optional[paddle.Tensor]=None, - audio_fps: Optional[float]=None, - audio_codec: Optional[str]=None, - audio_options: Optional[Dict[str, Any]]=None, ) -> None: + filename: str, + video_array: paddle.Tensor, + fps: float, + video_codec: str = "libx264", + options: Optional[Dict[str, Any]] = None, + audio_array: Optional[paddle.Tensor] = None, + audio_fps: Optional[float] = None, + audio_codec: Optional[str] = None, + audio_options: Optional[Dict[str, Any]] = None, +) -> None: """ Writes a 4d tensor in [T, H, W, C] format in a video file @@ -101,10 +99,8 @@ def write_video( audio_layout = "stereo" if num_channels > 1 else "mono" audio_sample_fmt = container.streams.audio[0].format.name format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt]) - audio_array = ( - paddle.to_tensor(data=audio_array).numpy().astype(format_dtype)) - frame = av.AudioFrame.from_ndarray( - 
audio_array, format=audio_sample_fmt, layout=audio_layout) + audio_array = paddle.to_tensor(data=audio_array).numpy().astype(format_dtype) + frame = av.AudioFrame.from_ndarray(audio_array, format=audio_sample_fmt, layout=audio_layout) frame.sample_rate = audio_fps for packet in a_stream.encode(frame): container.mux(packet) @@ -121,13 +117,14 @@ def write_video( @paddle.no_grad() def make_grid( - tensor: Union[paddle.Tensor, List[paddle.Tensor]], - nrow: int=8, - padding: int=2, - normalize: bool=False, - value_range: Optional[Tuple[int, int]]=None, - scale_each: bool=False, - pad_value: float=0.0, ) -> paddle.Tensor: + tensor: Union[paddle.Tensor, List[paddle.Tensor]], + nrow: int = 8, + padding: int = 2, + normalize: bool = False, + value_range: Optional[Tuple[int, int]] = None, + scale_each: bool = False, + pad_value: float = 0.0, +) -> paddle.Tensor: """ Make a grid of images. @@ -153,12 +150,9 @@ def make_grid( if isinstance(tensor, list): for t in tensor: if not paddle.is_tensor(x=t): - raise TypeError( - f"tensor or list of tensors expected, got a list containing {type(t)}" - ) + raise TypeError(f"tensor or list of tensors expected, got a list containing {type(t)}") else: - raise TypeError( - f"tensor or list of tensors expected, got {type(tensor)}") + raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}") if isinstance(tensor, list): tensor = paddle.stack(x=tensor, axis=0) if tensor.dim() == 2: @@ -172,9 +166,7 @@ def make_grid( if normalize is True: tensor = tensor.clone() if value_range is not None and not isinstance(value_range, tuple): - raise TypeError( - "value_range has to be a tuple (min, max) if specified. min and max are numbers" - ) + raise TypeError("value_range has to be a tuple (min, max) if specified. min and max are numbers") def norm_ip(img, low, high): img.clip_(min=low, max=high) @@ -198,32 +190,33 @@ def norm_range(t, value_range): nmaps = tensor.shape[0] xmaps = min(nrow, nmaps) ymaps = int(math.ceil(float(nmaps) / xmaps)) - height, width = int(tensor.shape[2] + padding), int(tensor.shape[3] + - padding) + height, width = int(tensor.shape[2] + padding), int(tensor.shape[3] + padding) num_channels = tensor.shape[1] grid = paddle.full( shape=(num_channels, height * ymaps + padding, width * xmaps + padding), fill_value=pad_value, - dtype=tensor.dtype, ) + dtype=tensor.dtype, + ) k = 0 for y in range(ymaps): for x in range(xmaps): if k >= nmaps: break - start_0 = (grid.shape[1] + y * height + padding - if y * height + padding < 0 else y * height + padding) - start_1 = (paddle.slice(grid, [1], [start_0], - [start_0 + height - padding]).shape[2] + x * - width + padding - if x * width + padding < 0 else x * width + padding) + start_0 = grid.shape[1] + y * height + padding if y * height + padding < 0 else y * height + padding + start_1 = ( + paddle.slice(grid, [1], [start_0], [start_0 + height - padding]).shape[2] + x * width + padding + if x * width + padding < 0 + else x * width + padding + ) paddle.assign( tensor[k], output=paddle.slice( - paddle.slice(grid, [1], [start_0], - [start_0 + height - padding]), + paddle.slice(grid, [1], [start_0], [start_0 + height - padding]), [2], [start_1], - [start_1 + width - padding], ), ) + [start_1 + width - padding], + ), + ) k = k + 1 return grid @@ -264,13 +257,12 @@ def to_tensor(pic) -> paddle.Tensor: if img.dtype == paddle.uint8: return paddle.divide( img.cast(default_float_dtype), - paddle.to_tensor( - 255, dtype=paddle.float32), ) + paddle.to_tensor(255, dtype=paddle.float32), + ) else: return img 
mode_to_nptype = {"I": np.int32, "I;16": np.int16, "F": np.float32} - img = paddle.to_tensor(data=np.array( - pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)) + img = paddle.to_tensor(data=np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)) if pic.mode == "1": img = 255 * img img = img.reshape([pic.size[1], pic.size[0], get_image_num_channels(pic)]) @@ -299,20 +291,21 @@ def fill_with_black_squares(video, desired_len: int) -> paddle.Tensor: return paddle.concat( x=[ video, - paddle.zeros_like(x=video[0]).unsqueeze(axis=0) - .tile(repeat_times=[desired_len - len(video), 1, 1, 1]), + paddle.zeros_like(x=video[0]).unsqueeze(axis=0).tile(repeat_times=[desired_len - len(video), 1, 1, 1]), ], - axis=0, ) + axis=0, + ) def npz_to_video_grid( - data_path, - out_path, - num_frames=None, - fps=8, - num_videos=None, - nrow=None, - verbose=True, ): + data_path, + out_path, + num_frames=None, + fps=8, + num_videos=None, + nrow=None, + verbose=True, +): if isinstance(data_path, str): videos = load_num_videos(data_path, num_videos) elif isinstance(data_path, np.ndarray): @@ -332,22 +325,14 @@ def npz_to_video_grid( if num_frames is None: num_frames = videos.shape[1] if verbose: - videos = [ - fill_with_black_squares(v, num_frames) - for v in tqdm( - videos_th, desc="Adding empty frames") - ] + videos = [fill_with_black_squares(v, num_frames) for v in tqdm(videos_th, desc="Adding empty frames")] else: videos = [fill_with_black_squares(v, num_frames) for v in videos_th] frame_grids = paddle.stack(x=videos).transpose(perm=[1, 0, 2, 3, 4]) if nrow is None: nrow = int(np.ceil(np.sqrt(n))) if verbose: - frame_grids = [ - make_grid( - fs, nrow=nrow) for fs in tqdm( - frame_grids, desc="Making grids") - ] + frame_grids = [make_grid(fs, nrow=nrow) for fs in tqdm(frame_grids, desc="Making grids")] else: frame_grids = [make_grid(fs, nrow=nrow) for fs in frame_grids] @@ -356,21 +341,14 @@ def npz_to_video_grid( os.makedirs(os.path.dirname(out_path), exist_ok=True) if isinstance("uint8", paddle.dtype): dtype = "uint8" - elif isinstance("uint8", - str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: + elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: dtype = "uint8" elif isinstance("uint8", paddle.Tensor): dtype = "uint8".dtype else: dtype = (paddle.stack(x=frame_grids) * 255).dtype - frame_grids = ((paddle.stack(x=frame_grids) * 255).transpose( - perm=[0, 2, 3, 1]).cast(dtype)) - write_video( - out_path, - frame_grids, - fps=fps, - video_codec="h264", - options={"crf": "10"}) + frame_grids = (paddle.stack(x=frame_grids) * 255).transpose(perm=[0, 2, 3, 1]).cast(dtype) + write_video(out_path, frame_grids, fps=fps, video_codec="h264", options={"crf": "10"}) def savenp2sheet(imgs, savepath, nrow=None): @@ -398,10 +376,7 @@ def savenp2sheet(imgs, savepath, nrow=None): n_rows = int(np.ceil(n / n_cols)) print(n_cols) print(n_rows) - imgsheet = cv2.vconcat([ - cv2.hconcat(imgs_new[i * n_cols:(i + 1) * n_cols]) - for i in range(n_rows) - ]) + imgsheet = cv2.vconcat([cv2.hconcat(imgs_new[i * n_cols : (i + 1) * n_cols]) for i in range(n_rows)]) cv2.imwrite(savepath, imgsheet) print(f"saved in {savepath}") @@ -414,7 +389,7 @@ def npz_to_imgsheet_5d(data_path, res_dir, nrow=None): else: raise Exception if os.path.isdir(res_dir): - res_path = os.path.join(res_dir, f"samples.jpg") + res_path = os.path.join(res_dir, "samples.jpg") else: assert res_dir.endswith(".jpg") res_path = res_dir @@ -423,24 +398,25 @@ def npz_to_imgsheet_5d(data_path, res_dir, nrow=None): def save_results( 
- videos, - save_dir, - save_name="results", - save_fps=8, - save_mp4=True, - save_npz=False, - save_mp4_sheet=False, - save_jpg=False, ): + videos, + save_dir, + save_name="results", + save_fps=8, + save_mp4=True, + save_npz=False, + save_mp4_sheet=False, + save_jpg=False, +): if save_mp4: save_subdir = os.path.join(save_dir, "videos") os.makedirs(save_subdir, exist_ok=True) shape_str = "x".join([str(x) for x in videos[0:1, (...)].shape]) for i in range(videos.shape[0]): npz_to_video_grid( - videos[i:i + 1, (...)], - os.path.join(save_subdir, - f"{save_name}_{i:03d}_{shape_str}.mp4"), - fps=save_fps, ) + videos[i : i + 1, (...)], + os.path.join(save_subdir, f"{save_name}_{i:03d}_{shape_str}.mp4"), + fps=save_fps, + ) print(f"Successfully saved videos in {save_subdir}") shape_str = "x".join([str(x) for x in videos.shape]) if save_npz: diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py index 4ba5c5d72ec8e..713ed5d8191b5 100644 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py @@ -14,8 +14,11 @@ # limitations under the License. import paddle from paddle import nn -from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig, - CLIPVisionModel) +from paddlenlp.transformers import ( + CLIPPretrainedModel, + CLIPVisionConfig, + CLIPVisionModel, +) from ...models.attention import BasicTransformerBlock from ...utils import logging @@ -42,8 +45,8 @@ def __init__(self, config: CLIPVisionConfig, proj_size=None): self.uncond_vector = self.create_parameter( [1, 1, self.projection_dim], dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Assign( - paddle.rand((1, 1, self.projection_dim))), ) + default_initializer=nn.initializer.Assign(paddle.rand((1, 1, self.projection_dim))), + ) def forward(self, pixel_values, return_uncond_vector=False): clip_output = self.model(pixel_values=pixel_values) @@ -63,14 +66,18 @@ def __init__(self, config: CLIPVisionConfig): num_layers = (config.num_hidden_layers + 1) // 5 hid_size = config.hidden_size num_heads = 1 - self.blocks = nn.LayerList([ - BasicTransformerBlock( - hid_size, - num_heads, - hid_size, - activation_fn="gelu", - attention_bias=True, ) for _ in range(num_layers) - ]) + self.blocks = nn.LayerList( + [ + BasicTransformerBlock( + hid_size, + num_heads, + hid_size, + activation_fn="gelu", + attention_bias=True, + ) + for _ in range(num_layers) + ] + ) def forward(self, hidden_states): for block in self.blocks: diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index f6b679c76b433..8ed3770065a18 100644 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -62,14 +62,11 @@ def prepare_mask_and_masked_image(image, mask): """ if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = 
image.unsqueeze(0) # Batch and add channel dim for single mask @@ -84,12 +81,9 @@ def prepare_mask_and_masked_image(image, mask): else: mask = mask.unsqueeze(0) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" assert mask.shape[1] == 1, "Mask image must have a single channel" # Check image is in [-1, 1] @@ -109,14 +103,12 @@ def prepare_mask_and_masked_image(image, mask): # Image as float32 image = image.cast(paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: if isinstance(image, PIL.Image.Image): image = [image] - image = np.concatenate( - [np.array(i.convert("RGB"))[None, :] for i in image], axis=0) + image = np.concatenate([np.array(i.convert("RGB"))[None, :] for i in image], axis=0) image = image.transpose(0, 3, 1, 2) image = paddle.to_tensor(image).cast(paddle.float32) / 127.5 - 1.0 @@ -124,8 +116,7 @@ def prepare_mask_and_masked_image(image, mask): if isinstance(mask, PIL.Image.Image): mask = [mask] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 # paint-by-example inverses the mask @@ -170,15 +161,15 @@ class PaintByExamplePipeline(DiffusionPipeline): _optional_components = ["safety_checker"] def __init__( - self, - vae: AutoencoderKL, - image_encoder: PaintByExampleImageEncoder, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae: AutoencoderKL, + image_encoder: PaintByExampleImageEncoder, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() self.register_modules( @@ -187,18 +178,18 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - 
clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -210,15 +201,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -234,40 +223,44 @@ def decode_latents(self, latents): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -283,22 +276,22 @@ def prepare_latents( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - dtype, - generator, - do_classifier_free_guidance, ): + self, + mask, + masked_image, + batch_size, + height, + width, + dtype, + generator, + do_classifier_free_guidance, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision mask = paddle.nn.functional.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) mask = mask.cast(dtype) masked_image = masked_image.cast(dtype) @@ -306,13 +299,12 @@ def prepare_mask_latents( # encode the mask image into latents space so we can concatenate it to the latents if isinstance(generator, list): masked_image_latents = [ - self.vae.encode(masked_image[i:i + 1]).latent_dist.sample( - generator=generator[i]) for i in range(batch_size) + self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(batch_size) ] masked_image_latents = paddle.concat(masked_image_latents, axis=0) else: - masked_image_latents = self.vae.encode( - masked_image).latent_dist.sample(generator=generator) + masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) masked_image_latents = self.vae.config.scaling_factor * masked_image_latents # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -331,71 +323,62 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype) return mask, masked_image_latents - def _encode_image(self, image, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance): dtype = self.image_encoder.dtype if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) - image_embeddings, negative_prompt_embeds = self.image_encoder( - image, return_uncond_vector=True) + image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True) # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, image_embeddings.shape[0], 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, 1, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, image_embeddings.shape[0], 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([bs_embed * num_images_per_prompt, 1, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings @paddle.no_grad() def __call__( - self, - example_image: Union[paddle.Tensor, PIL.Image.Image], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=5.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + example_image: Union[paddle.Tensor, PIL.Image.Image], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -477,8 +460,7 @@ def __call__( self.check_inputs(example_image, height, width, callback_steps) # 4. Encode input image - image_embeddings = self._encode_image( - example_image, num_images_per_prompt, do_classifier_free_guidance) + image_embeddings = self._encode_image(example_image, num_images_per_prompt, do_classifier_free_guidance) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -493,7 +475,8 @@ def __call__( width, image_embeddings.dtype, generator, - latents, ) + latents, + ) # 7. Prepare mask latent variables mask, masked_image_latents = self.prepare_mask_latents( @@ -504,60 +487,50 @@ def __call__( width, image_embeddings.dtype, generator, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # 8. Check that sizes of mask, masked image and latents match num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + num_channels_masked_image - != self.unet.config.in_channels): + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) # 9. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 10. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, masked_image_latents, mask], axis=1) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, masked_image_latents, mask], axis=1) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=image_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # must cast this, paddle.concat has bug... latents = latents.cast(image_embeddings.dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -566,8 +539,7 @@ def __call__( image = self.decode_latents(latents) # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, image_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype) # 13. 
Convert to PIL if output_type == "pil": @@ -576,5 +548,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py index 9c25c86f78f6a..b51612c302879 100644 --- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py @@ -30,9 +30,15 @@ import numpy as np import PIL import PIL.Image -from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_url, - model_info, repo_type_and_id_from_hf_id, - snapshot_download, upload_folder) +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_url, + model_info, + repo_type_and_id_from_hf_id, + snapshot_download, + upload_folder, +) from huggingface_hub.utils import EntryNotFoundError from packaging import version from tqdm.auto import tqdm @@ -40,13 +46,31 @@ from ..configuration_utils import ConfigMixin from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from ..utils import ( - CONFIG_NAME, DEPRECATED_REVISION_ARGS, DIFFUSERS_CACHE, FLAX_WEIGHTS_NAME, - FROM_DIFFUSERS, FROM_HF_HUB, HF_HUB_OFFLINE, LOW_CPU_MEM_USAGE_DEFAULT, - ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PPDIFFUSERS_CACHE, - TO_DIFFUSERS, TORCH_SAFETENSORS_WEIGHTS_NAME, TORCH_WEIGHTS_NAME, - BaseOutput, deprecate, get_class_from_dynamic_module, is_paddle_available, - is_paddlenlp_available, is_safetensors_available, logging, numpy_to_pil, - ppdiffusers_bos_dir_download, ppdiffusers_url_download) + CONFIG_NAME, + DEPRECATED_REVISION_ARGS, + DIFFUSERS_CACHE, + FLAX_WEIGHTS_NAME, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + LOW_CPU_MEM_USAGE_DEFAULT, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + TO_DIFFUSERS, + TORCH_SAFETENSORS_WEIGHTS_NAME, + TORCH_WEIGHTS_NAME, + BaseOutput, + deprecate, + get_class_from_dynamic_module, + is_paddle_available, + is_paddlenlp_available, + is_safetensors_available, + logging, + numpy_to_pil, + ppdiffusers_bos_dir_download, + ppdiffusers_url_download, +) from ..version import VERSION as __version__ if is_paddle_available(): @@ -133,8 +157,7 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray -def is_safetensors_compatible(filenames, variant=None, - passed_components=None) -> bool: +def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool: """ Checking for safetensors compatibility: - By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch @@ -154,8 +177,7 @@ def is_safetensors_compatible(filenames, variant=None, for filename in filenames: _, extension = os.path.splitext(filename) - if (len(filename.split("/")) == 2 and - filename.split("/")[0] in passed_components): + if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components: continue if extension == ".bin": @@ -183,8 +205,7 @@ def is_safetensors_compatible(filenames, variant=None, return True -def variant_compatible_siblings(filenames, - variant=None) -> Union[List[os.PathLike], str]: +def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]: weight_names = [ TORCH_WEIGHTS_NAME, TORCH_SAFETENSORS_WEIGHTS_NAME, @@ -217,35 +238,17 @@ def variant_compatible_siblings(filenames, 
rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" ) # `text_encoder/pytorch_model.bin.index.json` - non_variant_index_re = re.compile( - rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json" - ) + non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") if variant is not None: - variant_weights = { - f - for f in filenames - if variant_file_re.match(f.split("/")[-1]) is not None - } - variant_indexes = { - f - for f in filenames - if variant_index_re.match(f.split("/")[-1]) is not None - } + variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None} + variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None} variant_filenames = variant_weights | variant_indexes else: variant_filenames = set() - non_variant_weights = { - f - for f in filenames - if non_variant_file_re.match(f.split("/")[-1]) is not None - } - non_variant_indexes = { - f - for f in filenames - if non_variant_index_re.match(f.split("/")[-1]) is not None - } + non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None} + non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None} non_variant_filenames = non_variant_weights | non_variant_indexes # all variant filenames will be used by default @@ -254,12 +257,10 @@ def variant_compatible_siblings(filenames, def convert_to_variant(filename): if "index" in filename: variant_filename = filename.replace("index", f"index.{variant}") - elif (re.compile(f"^(.*?){transformers_index_format}").match(filename) - is not None): + elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None: variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}" else: - variant_filename = ( - f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}") + variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}" return variant_filename for f in non_variant_filenames: @@ -270,51 +271,46 @@ def convert_to_variant(filename): return usable_filenames, variant_filenames -def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token, - variant, revision, model_filenames): +def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token, variant, revision, model_filenames): info = model_info( pretrained_model_name_or_path, use_auth_token=use_auth_token, - revision=None, ) + revision=None, + ) filenames = {sibling.rfilename for sibling in info.siblings} - comp_model_filenames, _ = variant_compatible_siblings( - filenames, variant=revision) - comp_model_filenames = [ - ".".join(f.split(".")[:1] + f.split(".")[2:]) - for f in comp_model_filenames - ] + comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision) + comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames] if set(comp_model_filenames) == set(model_filenames): warnings.warn( f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. 
Please use `variant='{revision}'` instead.", - FutureWarning, ) + FutureWarning, + ) else: warnings.warn( f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.", - FutureWarning, ) + FutureWarning, + ) def maybe_raise_or_warn( - library_name, - library, - class_name, - importable_classes, - passed_class_obj, - name, - is_pipeline_module, ): + library_name, + library, + class_name, + importable_classes, + passed_class_obj, + name, + is_pipeline_module, +): """Simple helper method to raise or warn in case incorrect module has been passed""" if not is_pipeline_module: library = importlib.import_module(library_name) class_obj = getattr(library, class_name) - class_candidates = { - c: getattr(library, c, None) - for c in importable_classes.keys() - } + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} expected_class_obj = None for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, - class_candidate): + if class_candidate is not None and issubclass(class_obj, class_candidate): expected_class_obj = class_candidate # Dynamo wraps the original model in a private class. @@ -325,15 +321,16 @@ def maybe_raise_or_warn( if not issubclass(model_cls, expected_class_obj): raise ValueError( f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be" - f" {expected_class_obj}") + f" {expected_class_obj}" + ) else: logger.warning( f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it" - " has the correct type") + " has the correct type" + ) -def get_class_obj_and_candidates(library_name, class_name, importable_classes, - pipelines, is_pipeline_module): +def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines, is_pipeline_module): """Simple helper method to retrieve class object of module as well as potential parent class objects""" if is_pipeline_module: pipeline_module = getattr(pipelines, library_name) @@ -344,19 +341,12 @@ def get_class_obj_and_candidates(library_name, class_name, importable_classes, # else we just import it from the library. 
library = importlib.import_module(library_name) class_obj = getattr(library, class_name) - class_candidates = { - c: getattr(library, c, None) - for c in importable_classes.keys() - } + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} return class_obj, class_candidates -def _get_pipeline_class(class_obj, - config, - custom_pipeline=None, - cache_dir=None, - revision=None): +def _get_pipeline_class(class_obj, config, custom_pipeline=None, cache_dir=None, revision=None): if custom_pipeline is not None: if custom_pipeline.endswith(".py"): path = Path(custom_pipeline) @@ -370,31 +360,32 @@ def _get_pipeline_class(class_obj, custom_pipeline, module_file=file_name, cache_dir=cache_dir, - revision=revision, ) + revision=revision, + ) if class_obj != DiffusionPipeline: return class_obj - ppdiffusers_module = importlib.import_module( - class_obj.__module__.split(".")[0]) + ppdiffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) return getattr(ppdiffusers_module, config["_class_name"]) def load_sub_model( - library_name: str, - class_name: str, - importable_classes: List[Any], - pipelines: Any, - is_pipeline_module: bool, - pipeline_class: Any, - paddle_dtype: paddle.dtype, - runtime_options: Any, - model_variants: Dict[str, str], - name: str, - from_diffusers: bool, - low_cpu_mem_usage: bool=False, - cached_folder: Union[str, os.PathLike]=None, - **kwargs, ): + library_name: str, + class_name: str, + importable_classes: List[Any], + pipelines: Any, + is_pipeline_module: bool, + pipeline_class: Any, + paddle_dtype: paddle.dtype, + runtime_options: Any, + model_variants: Dict[str, str], + name: str, + from_diffusers: bool, + low_cpu_mem_usage: bool = False, + cached_folder: Union[str, os.PathLike] = None, + **kwargs, +): # support huggingface diffusers onnx model is_onnx_model = False if "Onnx" in class_name: @@ -403,29 +394,29 @@ def load_sub_model( """Helper method to load the module `name` from `library_name` and `class_name`""" # retrieve class candidates class_obj, class_candidates = get_class_obj_and_candidates( - library_name, class_name, importable_classes, pipelines, - is_pipeline_module) + library_name, class_name, importable_classes, pipelines, is_pipeline_module + ) load_method_name = None # retrive load method name for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, - class_candidate): + if class_candidate is not None and issubclass(class_obj, class_candidate): load_method_name = importable_classes[class_name][1] # if load method name is None, then we have a dummy module -> raise Error if load_method_name is None: none_module = class_obj.__module__ - is_dummy_path = none_module.startswith( - DUMMY_MODULES_FOLDER) or none_module.startswith( - PADDLENLP_DUMMY_MODULES_FOLDER) + is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith( + PADDLENLP_DUMMY_MODULES_FOLDER + ) if is_dummy_path and "dummy" in none_module: # call class_obj for nice error message of missing requirements class_obj() raise ValueError( f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have" - f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}.") + f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}." 
+ ) load_method = getattr(class_obj, load_method_name) @@ -435,17 +426,17 @@ def load_sub_model( # FastDeploy Model if issubclass(class_obj, FastDeployRuntimeModel): loading_kwargs["runtime_options"] = ( - runtime_options.get(name, None) - if isinstance(runtime_options, dict) else runtime_options) + runtime_options.get(name, None) if isinstance(runtime_options, dict) else runtime_options + ) if not is_onnx_model: if os.path.isdir(os.path.join(cached_folder, name)): is_onnx_model = any( - d.endswith(".onnx") or d.endswith(".pb") - for d in os.listdir(os.path.join(cached_folder, name))) + d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder, name)) + ) else: is_onnx_model = any( - d.endswith(".onnx") or d.endswith(".pb") - for d in os.listdir(os.path.join(cached_folder))) + d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder)) + ) loading_kwargs["is_onnx_model"] = is_onnx_model from ppdiffusers import ModelMixin @@ -461,8 +452,7 @@ def load_sub_model( try: # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): - loaded_sub_model = load_method( - os.path.join(cached_folder, name), **loading_kwargs) + loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: # else load from the root directory loaded_sub_model = load_method(cached_folder, **loading_kwargs) @@ -478,11 +468,10 @@ def load_sub_model( loaded_sub_model = load_method( pretrained_model_name_or_path + "/" + name, cache_dir=cache_dir, - **loading_kwargs, ) - if loaded_sub_model is None: - raise ValueError( - f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} " + **loading_kwargs, ) + if loaded_sub_model is None: + raise ValueError(f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} ") return loaded_sub_model @@ -517,19 +506,15 @@ def register_modules(self, **kwargs): register_dict = {name: (None, None)} else: # TODO (junnyu) support paddlenlp.transformers - if "paddlenlp" in module.__module__.split( - ".") or "ppnlp_patch_utils" in module.__module__.split( - "."): + if "paddlenlp" in module.__module__.split(".") or "ppnlp_patch_utils" in module.__module__.split("."): library = "paddlenlp.transformers" else: library = module.__module__.split(".")[0] # check if the module is a pipeline module - pipeline_dir = (module.__module__.split(".")[-2] if - len(module.__module__.split(".")) > 2 else None) + pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None path = module.__module__.split(".") - is_pipeline_module = pipeline_dir in path and hasattr( - pipelines, pipeline_dir) + is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) # if library is not in LOADABLE_CLASSES, then it is a custom module. 
# Or if it's a pipeline module, then the module is inside the pipeline @@ -549,19 +534,20 @@ def register_modules(self, **kwargs): setattr(self, name, module) # TODO junnyu, before register model, we may need to keep some module in fp32 - if (isinstance(module, nn.Layer) and - hasattr(module, "_keep_in_fp32_modules") and - module.dtype == paddle.float16 and - module._keep_in_fp32_modules is not None): - for module_name, sub_module in module.named_sublayers( - include_self=True): - if any(n in module_name - for n in module._keep_in_fp32_modules): + if ( + isinstance(module, nn.Layer) + and hasattr(module, "_keep_in_fp32_modules") + and module.dtype == paddle.float16 + and module._keep_in_fp32_modules is not None + ): + for module_name, sub_module in module.named_sublayers(include_self=True): + if any(n in module_name for n in module._keep_in_fp32_modules): sub_module.to(dtype=paddle.float32) if hasattr(sub_module, "pre_hook"): sub_module.pre_hook.remove() sub_module.pre_hook = sub_module.register_forward_pre_hook( - lambda layer, input: input[0].cast("float32")) + lambda layer, input: input[0].cast("float32") + ) def __setattr__(self, name: str, value: Any): if name in self.__dict__ and hasattr(self.config, name): @@ -570,7 +556,8 @@ def __setattr__(self, name: str, value: Any): if value is not None and self.config[name][0] is not None: class_library_tuple = ( value.__module__.split(".")[0], - value.__class__.__name__, ) + value.__class__.__name__, + ) else: class_library_tuple = (None, None) @@ -581,11 +568,12 @@ def __setattr__(self, name: str, value: Any): super().__setattr__(name, value) def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: bool=None, ): + self, + save_directory: Union[str, os.PathLike], + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: bool = None, + ): """ Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading @@ -619,10 +607,7 @@ def is_saveable_module(name, value): return False return True - model_index_dict = { - k: v - for k, v in model_index_dict.items() if is_saveable_module(k, v) - } + model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} for pipeline_component_name in model_index_dict.keys(): sub_model = getattr(self, pipeline_component_name) @@ -639,8 +624,7 @@ def is_saveable_module(name, value): ) for base_class, save_load_methods in library_classes.items(): class_candidate = getattr(library, base_class, None) - if class_candidate is not None and issubclass( - model_cls, class_candidate): + if class_candidate is not None and issubclass(model_cls, class_candidate): # if we found a suitable base class in LOADABLE_CLASSES then grab its save method save_method_name = save_load_methods[0] break @@ -648,23 +632,18 @@ def is_saveable_module(name, value): break if save_method_name is None: - logger.warn( - f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved." 
- ) + logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.") # make sure that unsaveable components are not tried to be loaded afterward - self.register_to_config( - **{pipeline_component_name: (None, None)}) + self.register_to_config(**{pipeline_component_name: (None, None)}) continue save_method = getattr(sub_model, save_method_name) # Call the save method with the argument safe_serialization only if it's supported save_method_signature = inspect.signature(save_method) - save_method_accept_safe = ( - "safe_serialization" in save_method_signature.parameters) + save_method_accept_safe = "safe_serialization" in save_method_signature.parameters save_method_accept_variant = "variant" in save_method_signature.parameters - save_method_accept_to_diffusers = ( - "to_diffusers" in save_method_signature.parameters) + save_method_accept_to_diffusers = "to_diffusers" in save_method_signature.parameters save_kwargs = {} # maybe we donot have torch so we use safe_serialization @@ -678,20 +657,19 @@ def is_saveable_module(name, value): if save_method_accept_to_diffusers: save_kwargs["to_diffusers"] = to_diffusers - save_method( - os.path.join(save_directory, pipeline_component_name), - **save_kwargs) + save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) # finally save the config self.save_config(save_directory, to_diffusers=to_diffusers) def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool]=None, - commit_message: Optional[str]=None, - revision: Optional[str]=None, - create_pr: bool=False, ): + self, + repo_id: str, + private: Optional[bool] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): """ Uploads all elements of this pipeline to a new HuggingFace Hub repository. Args: @@ -715,9 +693,7 @@ def save_to_hf_hub( # Check if README file already exist in repo try: - get_hf_file_metadata( - hf_hub_url( - repo_id=repo_id, filename="README.md", revision=revision)) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) has_readme = True except EntryNotFoundError: has_readme = False @@ -739,13 +715,15 @@ def save_to_hf_hub( folder_path=tmp_dir, commit_message=commit_message, revision=revision, - create_pr=create_pr, ) + create_pr=create_pr, + ) def to( - self, - paddle_device: Optional[str]=None, - paddle_dtype: Optional[paddle.dtype]=None, - silence_dtype_warnings: bool=True, ): + self, + paddle_device: Optional[str] = None, + paddle_dtype: Optional[paddle.dtype] = None, + silence_dtype_warnings: bool = True, + ): if paddle_device is None and paddle_dtype is None: return self @@ -753,9 +731,12 @@ def to( modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, nn.Layer)] for module in modules: - if (paddle_device is not None and module.dtype == paddle.float16 and - str(paddle_device) in ["cpu"] and - not silence_dtype_warnings): + if ( + paddle_device is not None + and module.dtype == paddle.float16 + and str(paddle_device) in ["cpu"] + and not silence_dtype_warnings + ): logger.warning( "Pipelines loaded with `paddle_dtype=paddle.float16` cannot run with `cpu` device. It" " is not recommended to move them to `cpu` as running them will fail. 
Please make" @@ -771,19 +752,20 @@ def to( module.to(**kwargs) # TODO junnyu, before register model, we may need to keep some module in fp32 - if (isinstance(module, nn.Layer) and - hasattr(module, "_keep_in_fp32_modules") and - module.dtype == paddle.float16 and - module._keep_in_fp32_modules is not None): - for module_name, sub_module in module.named_sublayers( - include_self=True): - if any(n in module_name - for n in module._keep_in_fp32_modules): + if ( + isinstance(module, nn.Layer) + and hasattr(module, "_keep_in_fp32_modules") + and module.dtype == paddle.float16 + and module._keep_in_fp32_modules is not None + ): + for module_name, sub_module in module.named_sublayers(include_self=True): + if any(n in module_name for n in module._keep_in_fp32_modules): sub_module.to(dtype=paddle.float32) if hasattr(sub_module, "pre_hook"): sub_module.pre_hook.remove() sub_module.pre_hook = sub_module.register_forward_pre_hook( - lambda layer, input: input[0].cast("float32")) + lambda layer, input: input[0].cast("float32") + ) return self @property @@ -801,10 +783,7 @@ def device(self): return "cpu" @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Instantiate a Paddle diffusion pipeline from pre-trained pipeline weights. @@ -964,18 +943,17 @@ def from_pretrained( custom_pipeline = kwargs.pop("custom_pipeline", None) custom_revision = kwargs.pop("custom_revision", None) runtime_options = kwargs.pop("runtime_options", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", - LOW_CPU_MEM_USAGE_DEFAULT) - use_safetensors = kwargs.pop("use_safetensors", None - if is_safetensors_available() else False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) variant = kwargs.pop("variant", None) from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) # deperate return_cached_folder = kwargs.pop("return_cached_folder", False) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) load_sub_model_kwargs = { "pretrained_model_name_or_path": pretrained_model_name_or_path, @@ -1003,7 +981,8 @@ def from_pretrained( variant=variant, from_hf_hub=from_hf_hub, from_diffusers=from_diffusers, - **kwargs, ) + **kwargs, + ) else: # is_local_dir load_sub_model_kwargs["is_local_dir"] = True @@ -1023,8 +1002,8 @@ def from_pretrained( folder_path = os.path.join(cached_folder, folder) is_folder = os.path.isdir(folder_path) and folder in config_dict variant_exists = is_folder and any( - p.split(".")[1].startswith(variant) - for p in os.listdir(folder_path)) + p.split(".")[1].startswith(variant) for p in os.listdir(folder_path) + ) if variant_exists: model_variants[folder] = variant @@ -1035,18 +1014,22 @@ def from_pretrained( config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, - revision=custom_revision, ) + revision=custom_revision, + ) # DEPRECATED: To be removed in 1.0.0 - _ppdiffusers_version = (config_dict["_diffusers_paddle_version"] - if "_diffusers_paddle_version" in config_dict - else config_dict["_ppdiffusers_version"]) - if (pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and - version.parse( - 
version.parse(_ppdiffusers_version).base_version) <= - version.parse("0.5.1")): - from ppdiffusers import (StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy) + _ppdiffusers_version = ( + config_dict["_diffusers_paddle_version"] + if "_diffusers_paddle_version" in config_dict + else config_dict["_ppdiffusers_version"] + ) + if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse( + version.parse(_ppdiffusers_version).base_version + ) <= version.parse("0.5.1"): + from ppdiffusers import ( + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, + ) pipeline_class = StableDiffusionInpaintPipelineLegacy @@ -1063,7 +1046,8 @@ def from_pretrained( "StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) # 4. Define expected modules given pipeline signature # and define non-None initialized modules (=`init_kwargs`) @@ -1071,26 +1055,15 @@ def from_pretrained( # some modules can be passed directly to the init # in this case they are already instantiated in `kwargs` # extract them here - expected_modules, optional_kwargs = cls._get_signature_keys( - pipeline_class) - passed_class_obj = { - k: kwargs.pop(k) - for k in expected_modules if k in kwargs - } - passed_pipe_kwargs = { - k: kwargs.pop(k) - for k in optional_kwargs if k in kwargs - } + expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} - init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict( - config_dict, **kwargs) + init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) # define init kwargs - init_kwargs = { - k: init_dict.pop(k) - for k in optional_kwargs if k in init_dict - } - init_kwargs = { ** init_kwargs, ** passed_pipe_kwargs} + init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} + init_kwargs = {**init_kwargs, **passed_pipe_kwargs} # remove `null` components def load_module(name, value): @@ -1127,8 +1100,7 @@ def load_module(name, value): # 6.2 Define all importable classes is_pipeline_module = hasattr(pipelines, library_name) - importable_classes = (ALL_IMPORTABLE_CLASSES if is_pipeline_module - else LOADABLE_CLASSES[library_name]) + importable_classes = ALL_IMPORTABLE_CLASSES if is_pipeline_module else LOADABLE_CLASSES[library_name] loaded_sub_model = None # 6.3 Use passed sub model or load class_name from library_name @@ -1144,7 +1116,8 @@ def load_module(name, value): importable_classes, passed_class_obj, name, - is_pipeline_module, ) + is_pipeline_module, + ) loaded_sub_model = passed_class_obj[name] else: @@ -1164,23 +1137,20 @@ def load_module(name, value): variant=variant, low_cpu_mem_usage=low_cpu_mem_usage, cached_folder=cached_folder, - **load_sub_model_kwargs, ) + **load_sub_model_kwargs, + ) - init_kwargs[ - name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) + init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) # 7. 
Potentially add passed objects if expected missing_modules = set(expected_modules) - set(init_kwargs.keys()) passed_modules = list(passed_class_obj.keys()) optional_modules = pipeline_class._optional_components - if len(missing_modules) > 0 and missing_modules <= set( - passed_modules + optional_modules): + if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): for module in missing_modules: init_kwargs[module] = passed_class_obj.get(module, None) elif len(missing_modules) > 0: - passed_modules = ( - set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - - optional_kwargs) + passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs raise ValueError( f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." ) @@ -1195,8 +1165,7 @@ def load_module(name, value): for _submodule in _module: if isinstance(_submodule, nn.Layer): _submodule.eval() - if (paddle_dtype is not None and - _submodule.dtype != paddle_dtype): + if paddle_dtype is not None and _submodule.dtype != paddle_dtype: _submodule.to(dtype=paddle_dtype) # 9. Instantiate the pipeline @@ -1210,8 +1179,7 @@ def load_module(name, value): return model @classmethod - def download(cls, pretrained_model_name, - **kwargs) -> Union[str, os.PathLike]: + def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: r""" Download and cache a PyTorch diffusion pipeline from pre-trained pipeline weights. Parameters: @@ -1284,8 +1252,9 @@ def download(cls, pretrained_model_name, """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) @@ -1299,8 +1268,7 @@ def download(cls, pretrained_model_name, use_safetensors = kwargs.pop("use_safetensors", None) max_workers = int(kwargs.pop("max_workers", 1)) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetensors`" ) @@ -1324,14 +1292,14 @@ def download(cls, pretrained_model_name, use_auth_token=use_auth_token, revision=revision, from_hf_hub=from_hf_hub, - return_config_file=True, ) + return_config_file=True, + ) ignore_filenames = config_dict.pop("_ignore_files", []) # if is_fastdeploy_model, we won't use safetensors if cls == DiffusionPipeline: - is_fastdeploy_model = ( - "fastdeploy" in config_dict.get("_class_name", "").lower()) + is_fastdeploy_model = "fastdeploy" in config_dict.get("_class_name", "").lower() else: is_fastdeploy_model = "fastdeploy" in cls.__name__.lower() if is_fastdeploy_model: @@ -1354,46 +1322,38 @@ def download(cls, pretrained_model_name, info = model_info( pretrained_model_name, use_auth_token=use_auth_token, - revision=revision, ) + revision=revision, + ) filenames = {sibling.rfilename for sibling in info.siblings} - model_filenames, variant_filenames = variant_compatible_siblings( - filenames, variant=variant) + model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant) # remove ignored filenames model_filenames = set(model_filenames) - set(ignore_filenames) - variant_filenames = set(variant_filenames) - set( - ignore_filenames) + variant_filenames = set(variant_filenames) - set(ignore_filenames) # if the whole pipeline is cached we don't have to ping the Hub if revision in DEPRECATED_REVISION_ARGS and version.parse( - version.parse(__version__) - .base_version) >= version.parse("0.17.0"): + version.parse(__version__).base_version + ) >= version.parse("0.17.0"): warn_deprecated_model_variant( pretrained_model_name, use_auth_token, variant, revision, - model_filenames, ) + model_filenames, + ) - model_folder_names = { - os.path.split(f)[0] - for f in model_filenames - } + model_folder_names = {os.path.split(f)[0] for f in model_filenames} # all filenames compatible with variant will be added allow_patterns = list(model_filenames) # allow all patterns from non-model folders # this enables downloading schedulers, tokenizers, ...
- allow_patterns += [ - os.path.join(k, "*") for k in folder_names - if k not in model_folder_names - ] + allow_patterns += [os.path.join(k, "*") for k in folder_names if k not in model_folder_names] # also allow downloading config.json files with the model - allow_patterns += [ - os.path.join(k, "config.json") for k in model_folder_names - ] + allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names] allow_patterns += [ SCHEDULER_CONFIG_NAME, @@ -1408,24 +1368,28 @@ def download(cls, pretrained_model_name, config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, - revision=custom_revision, ) + revision=custom_revision, + ) expected_components, _ = cls._get_signature_keys(pipeline_class) - passed_components = [ - k for k in expected_components if k in kwargs - ] + passed_components = [k for k in expected_components if k in kwargs] - if (use_safetensors and not allow_pickle and - not is_safetensors_compatible( - model_filenames, - variant=variant, - passed_components=passed_components, )): + if ( + use_safetensors + and not allow_pickle + and not is_safetensors_compatible( + model_filenames, + variant=variant, + passed_components=passed_components, + ) + ): raise EnvironmentError( f"Could not find the necessary `safetensors` weights in {model_filenames} (variant={variant})" ) elif use_safetensors and is_safetensors_compatible( - model_filenames, - variant=variant, - passed_components=passed_components, ): + model_filenames, + variant=variant, + passed_components=passed_components, + ): ignore_patterns = [ "*.msgpack", "*.bin", @@ -1434,79 +1398,50 @@ def download(cls, pretrained_model_name, "*.pdmodel", ] - safetensors_variant_filenames = { - f - for f in variant_filenames if f.endswith(".safetensors") - } - safetensors_model_filenames = { - f - for f in model_filenames if f.endswith(".safetensors") - } - if (len(safetensors_variant_filenames) > 0 and - safetensors_model_filenames != - safetensors_variant_filenames): + safetensors_variant_filenames = {f for f in variant_filenames if f.endswith(".safetensors")} + safetensors_model_filenames = {f for f in model_filenames if f.endswith(".safetensors")} + if ( + len(safetensors_variant_filenames) > 0 + and safetensors_model_filenames != safetensors_variant_filenames + ): logger.warn( f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(safetensors_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(safetensors_model_filenames - safetensors_variant_filenames)}\nIf this behavior is not expected, please check your folder structure."
) else: ignore_patterns = ["*.safetensors", "*.msgpack"] if from_diffusers: - ignore_patterns.extend( - ["*.pdparams", "*.pdiparams", "*.pdmodel"]) + ignore_patterns.extend(["*.pdparams", "*.pdiparams", "*.pdmodel"]) suffix = ".bin" else: if is_fastdeploy_model: ignore_patterns.extend(["*.pdparams", "*.bin"]) suffix = ".pdmodel" else: - ignore_patterns.extend( - ["*.pdiparams", "*.pdmodel", "*.bin"]) + ignore_patterns.extend(["*.pdiparams", "*.pdmodel", "*.bin"]) suffix = ".pdparams" - bin_variant_filenames = { - f - for f in variant_filenames if f.endswith(suffix) - } - bin_model_filenames = { - f - for f in model_filenames if f.endswith(suffix) - } - if (len(bin_variant_filenames) > 0 and - bin_model_filenames != bin_variant_filenames): + bin_variant_filenames = {f for f in variant_filenames if f.endswith(suffix)} + bin_model_filenames = {f for f in model_filenames if f.endswith(suffix)} + if len(bin_variant_filenames) > 0 and bin_model_filenames != bin_variant_filenames: logger.warn( f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(bin_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(bin_model_filenames - bin_variant_filenames)}\nIf this behavior is not expected, please check your folder structure." ) # Don't download any objects that are passed allow_patterns = [ - p for p in allow_patterns - if not (len(p.split("/")) == 2 and p.split("/")[0] in - passed_components) + p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components) ] # Don't download index files of forbidden patterns either - ignore_patterns = ignore_patterns + [ - f"{i}.index.*json" for i in ignore_patterns - ] + ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns] - re_ignore_pattern = [ - re.compile(fnmatch.translate(p)) for p in ignore_patterns - ] - re_allow_pattern = [ - re.compile(fnmatch.translate(p)) for p in allow_patterns - ] + re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns] + re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns] - expected_files = [ - f for f in filenames - if not any(p.match(f) for p in re_ignore_pattern) - ] - expected_files = [ - f for f in expected_files - if any(p.match(f) for p in re_allow_pattern) - ] + expected_files = [f for f in filenames if not any(p.match(f) for p in re_ignore_pattern)] + expected_files = [f for f in expected_files if any(p.match(f) for p in re_allow_pattern)] snapshot_folder = Path(config_file).parent - pipeline_is_cached = all((snapshot_folder / f).is_file() - for f in expected_files) + pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files) if pipeline_is_cached: # if the pipeline is cached, we can directly return it @@ -1514,8 +1449,7 @@ def download(cls, pretrained_model_name, return snapshot_folder user_agent = {"pipeline_class": cls.__name__} - if custom_pipeline is not None and not custom_pipeline.endswith( - ".py"): + if custom_pipeline is not None and not custom_pipeline.endswith(".py"): user_agent["custom_pipeline"] = custom_pipeline # download all allow_patterns - ignore_patterns @@ -1528,13 +1462,13 @@ def download(cls, pretrained_model_name, local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, - allow_patterns=list( - set(allow_patterns) - set(ignore_filenames)), + allow_patterns=list(set(allow_patterns) - set(ignore_filenames)), ignore_patterns=list( set(ignore_patterns + ignore_filenames) ), 
# diffusers bug, so we must add this ignore_filenames! user_agent=user_agent, - max_workers=max_workers, ) + max_workers=max_workers, + ) else: # only support [PD] .pdparams, fastdeploy model cached_folder = ppdiffusers_bos_dir_download( @@ -1547,17 +1481,16 @@ def download(cls, pretrained_model_name, variant=variant, is_fastdeploy_model=is_fastdeploy_model, local_files_only=local_files_only, - max_workers=max_workers, ) + max_workers=max_workers, + ) return cached_folder @classmethod - def from_pretrained_original_ckpt( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - **kwargs): - from .stable_diffusion.convert_from_ckpt_deprecated import \ - load_pipeline_from_original_stable_diffusion_ckpt + def from_pretrained_original_ckpt(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + from .stable_diffusion.convert_from_ckpt_deprecated import ( + load_pipeline_from_original_stable_diffusion_ckpt, + ) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) @@ -1568,37 +1501,33 @@ def from_pretrained_original_ckpt( pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isfile(pretrained_model_name_or_path): checkpoint_path = pretrained_model_name_or_path - elif pretrained_model_name_or_path.startswith( - "http://") or pretrained_model_name_or_path.startswith( - "https://"): + elif pretrained_model_name_or_path.startswith("http://") or pretrained_model_name_or_path.startswith( + "https://" + ): checkpoint_path = ppdiffusers_url_download( pretrained_model_name_or_path, cache_dir=cache_dir, resume_download=resume_download, - force_download=force_download, ) + force_download=force_download, + ) else: - raise EnvironmentError( - f"Please check your {pretrained_model_name_or_path}.") + raise EnvironmentError(f"Please check your {pretrained_model_name_or_path}.") pipeline = load_pipeline_from_original_stable_diffusion_ckpt( checkpoint_path=checkpoint_path, original_config_file=original_config_file, paddle_dtype=paddle_dtype, requires_safety_checker=requires_safety_checker, cls=cls, - **kwargs, ) + **kwargs, + ) return pipeline @staticmethod def _get_signature_keys(obj): parameters = inspect.signature(obj.__init__).parameters - required_parameters = { - k: v - for k, v in parameters.items() if v.default == inspect._empty - } - optional_parameters = set( - {k - for k, v in parameters.items() if v.default != inspect._empty}) + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) expected_modules = set(required_parameters.keys()) - {"self"} return expected_modules, optional_parameters @@ -1628,9 +1557,7 @@ def components(self) -> Dict[str, Any]: """ expected_modules, optional_parameters = self._get_signature_keys(self) components = { - k: getattr(self, k) - for k in self.config.keys() - if not k.startswith("_") and k not in optional_parameters + k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters } if set(components.keys()) != expected_modules: @@ -1666,8 +1593,7 @@ def progress_bar(self, iterable=None, total=None): def set_progress_bar_config(self, **kwargs): self._progress_bar_config = kwargs - def enable_xformers_memory_efficient_attention( - self, attention_op: Optional[str]=None): + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None): r""" Enable 
memory efficient attention as implemented in xformers. @@ -1701,15 +1627,13 @@ def disable_xformers_memory_efficient_attention(self): """ self.set_use_memory_efficient_attention_xformers(False) - def set_use_memory_efficient_attention_xformers( - self, valid: bool, attention_op: Optional[str]=None) -> None: + def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None: # Recursively walk through all the children. # Any children which exposes the set_use_memory_efficient_attention_xformers method # gets the message def fn_recursive_set_mem_eff(module: nn.Layer): if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, - attention_op) + module.set_use_memory_efficient_attention_xformers(valid, attention_op) for child in module.children(): fn_recursive_set_mem_eff(child) @@ -1721,8 +1645,7 @@ def fn_recursive_set_mem_eff(module: nn.Layer): for module in modules: fn_recursive_set_mem_eff(module) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. @@ -1749,10 +1672,7 @@ def disable_attention_slicing(self): def set_attention_slice(self, slice_size: Optional[int]): module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] - modules = [ - m for m in modules - if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice") - ] + modules = [m for m in modules if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice")] for module in modules: module.set_attention_slice(slice_size) diff --git a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py index 975204896be93..c946ea77ac787 100644 --- a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py +++ b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py @@ -46,14 +46,14 @@ def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=50, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + num_inference_steps: int = 50, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, `optional`, defaults to 1): The number of images to generate. 
@@ -80,8 +80,10 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) self.scheduler.set_timesteps(num_inference_steps) for t in self.progress_bar(self.scheduler.timesteps): @@ -95,6 +97,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py index a44fac86017af..b0d248fac49cc 100644 --- a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py +++ b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py @@ -38,11 +38,7 @@ def _preprocess_image(image: Union[List, PIL.Image.Image, paddle.Tensor]): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -62,12 +58,7 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, paddle.Tensor]): if isinstance(mask[0], PIL.Image.Image): w, h = mask[0].size w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 - mask = [ - np.array( - m.convert("L").resize( - (w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] - for m in mask - ] + mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask] mask = np.concatenate(mask, axis=0) mask = mask.astype(np.float32) / 255.0 mask[mask < 0.5] = 0 @@ -88,17 +79,17 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - num_inference_steps: int=250, - eta: float=0.0, - jump_length: int=10, - jump_n_sample: int=10, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + num_inference_steps: int = 250, + eta: float = 0.0, + jump_length: int = 10, + jump_n_sample: int = 10, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: image (`paddle.Tensor` or `PIL.Image.Image`): @@ -146,12 +137,10 @@ def __call__( ) image_shape = original_image.shape - image = randn_tensor( - image_shape, generator=generator, dtype=self.unet.dtype) + image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) # set step values - self.scheduler.set_timesteps(num_inference_steps, jump_length, - jump_n_sample) + self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample) self.scheduler.eta = eta t_last = self.scheduler.timesteps[0] + 1 @@ -161,9 +150,7 @@ def __call__( # predict the noise residual model_output = self.unet(image, t).sample # compute previous image: x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image, - original_image, mask_image, - 
generator).prev_sample + image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample else: # compute the reverse: x_t-1 -> x_t @@ -176,6 +163,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py index e3ce24a7eaf72..4e81855ba00f1 100644 --- a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +++ b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py @@ -40,14 +40,14 @@ def __init__(self, unet: UNet2DModel, scheduler: DiffusionPipeline): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=2000, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + num_inference_steps: int = 2000, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -70,25 +70,22 @@ def __call__( model = self.unet - sample = (randn_tensor( - shape, generator=generator) * self.scheduler.init_noise_sigma) + sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma self.scheduler.set_timesteps(num_inference_steps) self.scheduler.set_sigmas(num_inference_steps) for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): - sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0], )) + sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0],)) # correction step for _ in range(self.scheduler.config.correct_steps): model_output = self.unet(sample, sigma_t).sample - sample = self.scheduler.step_correct( - model_output, sample, generator=generator).prev_sample + sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample # prediction step model_output = model(sample, sigma_t).sample - output = self.scheduler.step_pred( - model_output, t, sample, generator=generator) + output = self.scheduler.step_pred(model_output, t, sample, generator=generator) sample, sample_mean = output.prev_sample, output.prev_sample_mean @@ -98,6 +95,6 @@ def __call__( sample = self.numpy_to_pil(sample) if not return_dict: - return (sample, ) + return (sample,) return ImagePipelineOutput(images=sample) diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py index e24cb5eee2eb1..9842e59ad078e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py @@ -42,5 +42,4 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput): if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_semantic_stable_diffusion import \ - SemanticStableDiffusionPipeline + from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py index b8778c74b1d86..7fd2b4f407754 
100644 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py +++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py @@ -68,8 +68,7 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): if isinstance(axis, list): axis_src, axis_dst = [], [] for axis_single in axis: - if not isinstance(axis_single, int) or not ( - axis_single < dims and axis_single >= -dims): + if not isinstance(axis_single, int) or not (axis_single < dims and axis_single >= -dims): raise ValueError( "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." ) @@ -88,17 +87,13 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): axis = axis_dst[0] else: if not isinstance(axis, int) or not (axis < dims and axis >= -dims): - raise ValueError( - "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." - ) + raise ValueError("Axis should be None, int, or a list, element should in range [-rank(x), rank(x)).") if axis < 0: axis += dims out_shape[axis] = 1 mask = x.isnan() - valid_counts = mask.logical_not().sum(axis=axis, - keepdim=True, - dtype="float64") + valid_counts = mask.logical_not().sum(axis=axis, keepdim=True, dtype="float64") indices = [] @@ -127,15 +122,14 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): for index in indices: indices_below = paddle.floor(index).astype(paddle.int32) indices_upper = paddle.ceil(index).astype(paddle.int32) - tensor_upper = paddle.take_along_axis( - sorted_tensor, indices_upper, axis=axis) - tensor_below = paddle.take_along_axis( - sorted_tensor, indices_below, axis=axis) + tensor_upper = paddle.take_along_axis(sorted_tensor, indices_upper, axis=axis) + tensor_below = paddle.take_along_axis(sorted_tensor, indices_below, axis=axis) weights = index - indices_below.astype("float64") out = paddle.lerp( tensor_below.astype("float64"), tensor_upper.astype("float64"), - weights, ) + weights, + ) if not keepdim: out = paddle.squeeze(out, axis=axis) else: diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index df0e298fe252a..70eaa17e88188 100644 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -18,13 +18,11 @@ from typing import Callable, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...models import AutoencoderKL, UNet2DConditionModel from ...pipeline_utils import DiffusionPipeline -from ...pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, randn_tensor from . 
import SemanticStableDiffusionPipelineOutput @@ -107,15 +105,16 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -141,8 +140,9 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -161,54 +161,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -221,23 +217,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -253,33 +252,33 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: int=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - editing_prompt: Optional[Union[str, List[str]]]=None, - editing_prompt_embeddings: Optional[paddle.Tensor]=None, - reverse_editing_direction: Optional[Union[bool, List[bool]]]=False, - edit_guidance_scale: Optional[Union[float, List[float]]]=5, - edit_warmup_steps: Optional[Union[int, List[int]]]=10, - edit_cooldown_steps: Optional[Union[int, List[int]]]=None, - edit_threshold: Optional[Union[float, List[float]]]=0.9, - edit_momentum_scale: Optional[float]=0.1, - edit_mom_beta: Optional[float]=0.4, - edit_weights: Optional[List[float]]=None, - sem_guidance: Optional[List[paddle.Tensor]]=None, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + editing_prompt: Optional[Union[str, List[str]]] = None, + editing_prompt_embeddings: Optional[paddle.Tensor] = None, + reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, + edit_guidance_scale: 
Optional[Union[float, List[float]]] = 5, + edit_warmup_steps: Optional[Union[int, List[int]]] = 10, + edit_cooldown_steps: Optional[Union[int, List[int]]] = None, + edit_threshold: Optional[Union[float, List[float]]] = 0.9, + edit_momentum_scale: Optional[float] = 0.1, + edit_mom_beta: Optional[float] = 0.4, + edit_weights: Optional[List[float]] = None, + sem_guidance: Optional[List[paddle.Tensor]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -399,61 +398,53 @@ def __call__( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] text_embeddings = self.text_encoder(text_input_ids)[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if enable_edit_guidance: # get safety text embeddings if editing_prompt_embeddings is None: edit_concepts_input = self.tokenizer( - [ - x - for item in editing_prompt - for x in repeat(item, batch_size) - ], + [x for item in editing_prompt for x in repeat(item, batch_size)], padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) edit_concepts_input_ids = edit_concepts_input.input_ids - if edit_concepts_input_ids.shape[ - -1] > self.tokenizer.model_max_length: + if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length: removed_text = self.tokenizer.batch_decode( - edit_concepts_input_ids[:, self.tokenizer. - model_max_length:]) + edit_concepts_input_ids[:, self.tokenizer.model_max_length :] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - edit_concepts_input_ids = edit_concepts_input_ids[:, :self. - tokenizer. 
- model_max_length] + edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length] edit_concepts = self.text_encoder(edit_concepts_input_ids)[0] else: - edit_concepts = editing_prompt_embeddings.tile( - [batch_size, 1, 1]) + edit_concepts = editing_prompt_embeddings.tile([batch_size, 1, 1]) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed_edit, seq_len_edit, _ = edit_concepts.shape edit_concepts = edit_concepts.tile([1, num_images_per_prompt, 1]) - edit_concepts = edit_concepts.reshape( - [bs_embed_edit * num_images_per_prompt, seq_len_edit, -1]) + edit_concepts = edit_concepts.reshape([bs_embed_edit * num_images_per_prompt, seq_len_edit, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -468,14 +459,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -485,25 +478,22 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes if enable_edit_guidance: - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings, edit_concepts]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings, edit_concepts]) else: - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it # 4. Prepare timesteps @@ -519,7 +509,8 @@ def __call__( width, text_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -534,41 +525,39 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * - (2 + enabled_editing_prompts)) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: - noise_pred_out = noise_pred.chunk( - 2 + enabled_editing_prompts) # [b,4, 64, 64] + noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts) # [b,4, 64, 64] noise_pred_uncond, noise_pred_text = ( noise_pred_out[0], - noise_pred_out[1], ) + noise_pred_out[1], + ) noise_pred_edit_concepts = noise_pred_out[2:] # default text guidance - noise_guidance = guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_guidance = guidance_scale * (noise_pred_text - noise_pred_uncond) # noise_guidance = (noise_pred_text - noise_pred_edit_concepts[0]) if self.uncond_estimates is None: self.uncond_estimates = paddle.zeros( (num_inference_steps + 1, *noise_pred_uncond.shape), - dtype=noise_pred.dtype, ) + dtype=noise_pred.dtype, + ) self.uncond_estimates[i] = noise_pred_uncond.detach() if self.text_estimates is None: self.text_estimates = paddle.zeros( (num_inference_steps + 1, *noise_pred_text.shape), - dtype=noise_pred.dtype, ) + dtype=noise_pred.dtype, + ) self.text_estimates[i] = noise_pred_text.detach() if self.edit_estimates is None and enable_edit_guidance: @@ -576,29 +565,32 @@ def __call__( ( num_inference_steps + 1, len(noise_pred_edit_concepts), - *noise_pred_edit_concepts[0].shape, ), - dtype=noise_pred.dtype, ) + *noise_pred_edit_concepts[0].shape, + ), + dtype=noise_pred.dtype, + ) if self.sem_guidance is None: self.sem_guidance = paddle.zeros( (num_inference_steps + 1, *noise_pred_text.shape), - dtype=noise_pred.dtype, ) + dtype=noise_pred.dtype, + ) if edit_momentum is None: edit_momentum = paddle.zeros_like(noise_guidance) if enable_edit_guidance: concept_weights = paddle.zeros( - (len(noise_pred_edit_concepts), - noise_guidance.shape[0]), - dtype=noise_guidance.dtype, ) + (len(noise_pred_edit_concepts), noise_guidance.shape[0]), + dtype=noise_guidance.dtype, + ) noise_guidance_edit = paddle.zeros( (len(noise_pred_edit_concepts), *noise_guidance.shape), - dtype=noise_guidance.dtype, ) + dtype=noise_guidance.dtype, + ) # noise_guidance_edit = torch.zeros_like(noise_guidance) warmup_inds = [] - for c, noise_pred_edit_concept in enumerate( - noise_pred_edit_concepts): + for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts): self.edit_estimates[i, c] = noise_pred_edit_concept if isinstance(edit_guidance_scale, list): edit_guidance_scale_c = edit_guidance_scale[c] @@ -610,8 +602,7 @@ def __call__( else: edit_threshold_c = edit_threshold if isinstance(reverse_editing_direction, list): - reverse_editing_direction_c = reverse_editing_direction[ - c] + reverse_editing_direction_c = reverse_editing_direction[c] else: reverse_editing_direction_c = reverse_editing_direction 
if edit_weights: @@ -632,27 +623,19 @@ def __call__( if i >= edit_warmup_steps_c: warmup_inds.append(c) if i >= edit_cooldown_steps_c: - noise_guidance_edit[ - c, :, :, :, :] = paddle.zeros_like( - noise_pred_edit_concept) + noise_guidance_edit[c, :, :, :, :] = paddle.zeros_like(noise_pred_edit_concept) continue - noise_guidance_edit_tmp = ( - noise_pred_edit_concept - noise_pred_uncond) + noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond # tmp_weights = (noise_pred_text - noise_pred_edit_concept).sum(dim=(1, 2, 3)) - tmp_weights = ( - noise_guidance - noise_pred_edit_concept).sum( - (1, 2, 3)) + tmp_weights = (noise_guidance - noise_pred_edit_concept).sum((1, 2, 3)) - tmp_weights = paddle.full_like( - tmp_weights, - edit_weight_c) # * (1 / enabled_editing_prompts) + tmp_weights = paddle.full_like(tmp_weights, edit_weight_c) # * (1 / enabled_editing_prompts) if reverse_editing_direction_c: noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1 concept_weights[c, :] = tmp_weights - noise_guidance_edit_tmp = (noise_guidance_edit_tmp * - edit_guidance_scale_c) + noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c # quantile function expects float32 if noise_guidance_edit_tmp.dtype == paddle.float32: @@ -660,23 +643,22 @@ def __call__( paddle.abs(noise_guidance_edit_tmp).flatten(2), edit_threshold_c, axis=2, - keepdim=False, ) + keepdim=False, + ) else: tmp = quantile( - paddle.abs(noise_guidance_edit_tmp).flatten(2) - .cast(paddle.float32), + paddle.abs(noise_guidance_edit_tmp).flatten(2).cast(paddle.float32), edit_threshold_c, axis=2, keepdim=False, ).cast(noise_guidance_edit_tmp.dtype) noise_guidance_edit_tmp = paddle.where( - paddle.abs(noise_guidance_edit_tmp) >= - tmp[:, :, None, None], + paddle.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None], noise_guidance_edit_tmp, - paddle.zeros_like(noise_guidance_edit_tmp), ) - noise_guidance_edit[ - c, :, :, :, :] = noise_guidance_edit_tmp + paddle.zeros_like(noise_guidance_edit_tmp), + ) + noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp @@ -685,22 +667,21 @@ def __call__( # concept_weights = concept_weights.to("cpu") # Offload to cpu # noise_guidance_edit = noise_guidance_edit.to("cpu") - concept_weights_tmp = paddle.index_select( - concept_weights, warmup_inds, 0) + concept_weights_tmp = paddle.index_select(concept_weights, warmup_inds, 0) concept_weights_tmp = paddle.where( concept_weights_tmp < 0, paddle.zeros_like(concept_weights_tmp), - concept_weights_tmp, ) - concept_weights_tmp = (concept_weights_tmp / - concept_weights_tmp.sum(0)) + concept_weights_tmp, + ) + concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(0) # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp) - noise_guidance_edit_tmp = paddle.index_select( - noise_guidance_edit, warmup_inds, 0) + noise_guidance_edit_tmp = paddle.index_select(noise_guidance_edit, warmup_inds, 0) noise_guidance_edit_tmp = paddle.einsum( "cb,cbijk->bijk", concept_weights_tmp, - noise_guidance_edit_tmp, ) + noise_guidance_edit_tmp, + ) noise_guidance_edit_tmp = noise_guidance_edit_tmp noise_guidance = noise_guidance + noise_guidance_edit_tmp @@ -714,17 +695,15 @@ def __call__( concept_weights = paddle.where( concept_weights < 0, paddle.zeros_like(concept_weights), - concept_weights, ) + concept_weights, + ) # concept_weights = paddle.nan_to_num(concept_weights) - noise_guidance_edit = paddle.einsum( - "cb,cbijk->bijk", concept_weights, 
noise_guidance_edit) + noise_guidance_edit = paddle.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit) - noise_guidance_edit = (noise_guidance_edit + - edit_momentum_scale * edit_momentum) + noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum - edit_momentum = (edit_mom_beta * edit_momentum + - (1 - edit_mom_beta) * noise_guidance_edit) + edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit if warmup_inds.shape[0] == len(noise_pred_edit_concepts): noise_guidance = noise_guidance + noise_guidance_edit @@ -737,8 +716,7 @@ def __call__( noise_pred = noise_pred_uncond + noise_guidance # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -748,12 +726,11 @@ def __call__( image = self.decode_latents(latents) if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.cast(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -763,5 +740,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return SemanticStableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py index 44d2a3ed3c947..53dd30da98557 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...utils import (OptionalDependencyNotAvailable, is_note_seq_available, - is_paddle_available, is_paddlenlp_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_note_seq_available, + is_paddle_available, + is_paddlenlp_available, +) try: if not (is_paddlenlp_available() and is_paddle_available()): @@ -23,10 +27,12 @@ else: from .notes_encoder import SpectrogramNotesEncoder from .pipeline_spectrogram_diffusion import ( - SpectrogramContEncoder, SpectrogramDiffusionPipeline, T5FilmDecoder) + SpectrogramContEncoder, + SpectrogramDiffusionPipeline, + T5FilmDecoder, + ) try: - if not (is_paddlenlp_available() and is_paddle_available() and - is_note_seq_available()): + if not (is_paddlenlp_available() and is_paddle_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py index 4378ce01e5784..d09306582dc21 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py @@ -17,28 +17,27 @@ from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm -from ...configuration_utils import (ConfigMixin, ModuleUtilsMixin, - register_to_config) +from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config from ...models import ModelMixin class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( - self, - input_dims: int, - targets_context_length: int, - d_model: int, - dropout_rate: float, - num_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str, - is_decoder: bool=False, ): + self, + input_dims: int, + targets_context_length: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + is_decoder: bool = False, + ): super().__init__() - self.input_proj = nn.Linear( - in_features=input_dims, out_features=d_model, bias_attr=False) + self.input_proj = nn.Linear(in_features=input_dims, out_features=d_model, bias_attr=False) self.position_encoding = nn.Embedding(targets_context_length, d_model) self.position_encoding.weight.stop_gradient = True self.dropout_pre = nn.Dropout(p=dropout_rate) @@ -50,7 +49,8 @@ def __init__( feed_forward_proj=feed_forward_proj, dropout_rate=dropout_rate, is_decoder=is_decoder, - is_encoder_decoder=False, ) + is_encoder_decoder=False, + ) self.encoders = nn.LayerList() for lyr_num in range(num_layers): lyr = T5Block(t5config) @@ -66,17 +66,13 @@ def forward(self, encoder_inputs, encoder_inputs_mask): input_positions = paddle.arange(end=max_positions) seq_lens = encoder_inputs_mask.sum(axis=-1) - input_positions = paddle.roll( - x=input_positions.unsqueeze(axis=0), - shifts=tuple(seq_lens.tolist()), - axis=0) + input_positions = paddle.roll(x=input_positions.unsqueeze(axis=0), shifts=tuple(seq_lens.tolist()), axis=0) x += self.position_encoding(input_positions) x = self.dropout_pre(x) # inverted the attention mask input_shape = encoder_inputs.shape - extended_attention_mask = self.get_extended_attention_mask( - encoder_inputs_mask, input_shape) + extended_attention_mask = 
self.get_extended_attention_mask(encoder_inputs_mask, input_shape) for lyr in self.encoders: x = lyr(x, extended_attention_mask)[0] diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py index d8dcc8a98cf87..3997ce07f5845 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -15,8 +15,17 @@ import dataclasses import math import os -from typing import (Any, Callable, List, Mapping, MutableMapping, Optional, - Sequence, Tuple, Union) +from typing import ( + Any, + Callable, + List, + Mapping, + MutableMapping, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np import paddle @@ -96,8 +105,7 @@ class NoteEncodingState: """Encoding state for note transcription, keeping track of active pitches.""" # velocity bin for active pitches and programs - active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field( - default_factory=dict) + active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict) @dataclasses.dataclass @@ -149,10 +157,11 @@ class Codec: """ def __init__( - self, - max_shift_steps: int, - steps_per_second: float, - event_ranges: List[EventRange], ): + self, + max_shift_steps: int, + steps_per_second: float, + event_ranges: List[EventRange], + ): """Define Codec. Args: @@ -162,14 +171,11 @@ def __init__( event_ranges: Other supported event types and their ranges. """ self.steps_per_second = steps_per_second - self._shift_range = EventRange( - type="shift", min_value=0, max_value=max_shift_steps) + self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps) self._event_ranges = [self._shift_range] + event_ranges # Ensure all event types have unique names. - assert len(self._event_ranges) == len( - {er.type - for er in self._event_ranges}) + assert len(self._event_ranges) == len({er.type for er in self._event_ranges}) @property def num_classes(self) -> int: @@ -179,8 +185,7 @@ def num_classes(self) -> int: # events that are intended to be used from within autograph functions. 
def is_shift_event_index(self, index: int) -> bool: - return (self._shift_range.min_value <= index and - index <= self._shift_range.max_value) + return self._shift_range.min_value <= index and index <= self._shift_range.max_value @property def max_shift_steps(self) -> int: @@ -235,31 +240,29 @@ def programs_to_midi_classes(tokens, codec): """Modifies program events to be the first program in the MIDI class.""" min_program_id, max_program_id = codec.event_type_range("program") is_program = (tokens >= min_program_id) & (tokens <= max_program_id) - return np.where(is_program, min_program_id + 8 * ( - (tokens - min_program_id) // 8), tokens) + return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) PROGRAM_GRANULARITIES = { # "flat" granularity; drop program change tokens and set NoteSequence # programs to zero - "flat": ProgramGranularity( - tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), + "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), # map each program to the first program in its MIDI class "midi_class": ProgramGranularity( tokens_map_fn=programs_to_midi_classes, - program_map_fn=lambda program: 8 * (program // 8), ), + program_map_fn=lambda program: 8 * (program // 8), + ), # leave programs as is "full": ProgramGranularity( tokens_map_fn=lambda tokens, codec: tokens, - program_map_fn=lambda program: program, ), + program_map_fn=lambda program: program, + ), } def unfold(tensor, dimension, size, step=1): - assert dimension < len( - tensor.shape), "dimension must be less than tensor dimensions" - assert (tensor.shape[dimension] >= size - ), "size should not be greater than the dimension of tensor" + assert dimension < len(tensor.shape), "dimension must be less than tensor dimensions" + assert tensor.shape[dimension] >= size, "size should not be greater than the dimension of tensor" slices = [] for i in range(0, tensor.shape[dimension] - size + 1, step): @@ -276,24 +279,19 @@ def unfold(tensor, dimension, size, step=1): return unfolded_tensor -def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, - axis=-1): +def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): """ equivalent of tf.signal.frame """ signal_length = signal.shape[axis] if pad_end: frames_overlap = frame_length - frame_step - rest_samples = np.abs(signal_length - frames_overlap) % np.abs( - frame_length - frames_overlap) + rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) pad_size = int(frame_length - rest_samples) if pad_size != 0: pad_axis = [0] * signal.ndim pad_axis[axis] = pad_size - signal = F.pad(x=signal, - pad=pad_axis, - mode="constant", - value=pad_value) + signal = F.pad(x=signal, pad=pad_axis, mode="constant", value=pad_value) frames = unfold(signal, axis, frame_length, frame_step) return frames @@ -305,28 +303,26 @@ def program_to_slakh_program(program): return slakh_program -def audio_to_frames( - samples, hop_size: int, - frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]: +def audio_to_frames(samples, hop_size: int, frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]: """Convert audio samples to non-overlapping frames and frame times.""" frame_size = hop_size - samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], - mode="constant") + samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant") # Split audio into frames. 
frames = frame( paddle.to_tensor(data=samples).unsqueeze(axis=0), frame_length=frame_size, frame_step=frame_size, - pad_end=False, ) + pad_end=False, + ) num_frames = len(samples) // frame_size times = np.arange(num_frames) / frame_rate return frames, times def note_sequence_to_onsets_and_offsets_and_programs( - ns: note_seq.NoteSequence, ) -> Tuple[Sequence[float], Sequence[ - NoteEventData]]: + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: """Extract onset & offset times and pitches & programs from a NoteSequence. The onset & offset times will not necessarily be in sorted order. @@ -341,21 +337,20 @@ def note_sequence_to_onsets_and_offsets_and_programs( """ # Sort by program and pitch and put offsets before onsets as a tiebreaker for # subsequent stable sort. - notes = sorted( - ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) - times = [note.end_time for note in notes if not note.is_drum] + [ - note.start_time for note in notes - ] + notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] values = [ - NoteEventData( - pitch=note.pitch, velocity=0, program=note.program, is_drum=False) - for note in notes if not note.is_drum + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum ] + [ NoteEventData( pitch=note.pitch, velocity=note.velocity, program=note.program, - is_drum=note.is_drum, ) for note in notes + is_drum=note.is_drum, + ) + for note in notes ] return times, values @@ -368,20 +363,19 @@ def num_velocity_bins_from_codec(codec: Codec): # segment an array into segments of length n def segment(a, n): - return [a[i:i + n] for i in range(0, len(a), n)] + return [a[i : i + n] for i in range(0, len(a), n)] def velocity_to_bin(velocity, num_velocity_bins): if velocity == 0: return 0 else: - return math.ceil(num_velocity_bins * velocity / - note_seq.MAX_MIDI_VELOCITY) + return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY) -def note_event_data_to_events(state: Optional[NoteEncodingState], - value: NoteEventData, - codec: Codec) -> Sequence[Event]: +def note_event_data_to_events( + state: Optional[NoteEncodingState], value: NoteEventData, codec: Codec +) -> Sequence[Event]: """Convert note event data to a sequence of events.""" if value.velocity is None: # onsets only, no program or velocity @@ -393,9 +387,7 @@ def note_event_data_to_events(state: Optional[NoteEncodingState], # onsets + offsets + velocities only, no programs if state is not None: state.active_pitches[value.pitch, 0] = velocity_bin - return [ - Event("velocity", velocity_bin), Event("pitch", value.pitch) - ] + return [Event("velocity", velocity_bin), Event("pitch", value.pitch)] elif value.is_drum: # drum events use a separate vocabulary return [Event("velocity", velocity_bin), Event("drum", value.pitch)] @@ -413,8 +405,7 @@ def note_event_data_to_events(state: Optional[NoteEncodingState], def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: """Output program and pitch events for active notes plus a final tie event.""" events = [] - for pitch, program in sorted( - state.active_pitches.keys(), key=lambda k: k[::-1]): + for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]): if state.active_pitches[pitch, program]: events += [Event("program", program), Event("pitch", pitch)] events.append(Event("tie", 
0)) @@ -422,13 +413,14 @@ def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: def encode_and_index_events( - state, - event_times, - event_values, - codec, - frame_times, - encode_event_fn, - encoding_state_to_events_fn=None, ): + state, + event_times, + event_values, + codec, + frame_times, + encode_event_fn, + encoding_state_to_events_fn=None, +): """Encode a sequence of timed events and index to audio frame times. Encodes time shifts as repeated single step shifts for later run length encoding. @@ -460,9 +452,7 @@ def encode_and_index_events( state_event_indices: Corresponding state event index for every audio frame. """ indices = np.argsort(event_times, kind="stable") - event_steps = [ - round(event_times[i] * codec.steps_per_second) for i in indices - ] + event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices] event_values = [event_values[i] for i in indices] events = [] state_events = [] @@ -473,9 +463,10 @@ def encode_and_index_events( cur_state_event_idx = 0 def fill_event_start_indices_to_cur_step(): - while (len(event_start_indices) < len(frame_times) and - frame_times[len(event_start_indices)] < cur_step / - codec.steps_per_second): + while ( + len(event_start_indices) < len(frame_times) + and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second + ): event_start_indices.append(cur_event_idx) state_event_indices.append(cur_state_event_idx) @@ -511,28 +502,24 @@ def fill_event_start_indices_to_cur_step(): events = np.array(events).astype(np.int32) state_events = np.array(state_events).astype(np.int32) - event_start_indices = segment( - np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - event_end_indices = segment( - np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - state_event_indices = segment( - np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) outputs = [] - for start_indices, end_indices, event_indices in zip( - event_start_indices, event_end_indices, state_event_indices): - outputs.append({ - "inputs": events, - "event_start_indices": start_indices, - "event_end_indices": end_indices, - "state_events": state_events, - "state_event_indices": event_indices, - }) + for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): + outputs.append( + { + "inputs": events, + "event_start_indices": start_indices, + "event_end_indices": end_indices, + "state_events": state_events, + "state_event_indices": event_indices, + } + ) return outputs -def extract_sequence_with_indices(features, - state_events_end_token=None, - feature_key="inputs"): +def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"): """Extract target sequence corresponding to audio token segment.""" features = features.copy() start_idx = features["event_start_indices"][0] @@ -543,36 +530,33 @@ def extract_sequence_with_indices(features, # prepend them to the targets array. 
state_event_start_idx = features["state_event_indices"][0] state_event_end_idx = state_event_start_idx + 1 - while (features["state_events"][state_event_end_idx - 1] != - state_events_end_token): + while features["state_events"][state_event_end_idx - 1] != state_events_end_token: state_event_end_idx += 1 features[feature_key] = np.concatenate( [ - features["state_events"][state_event_start_idx: - state_event_end_idx], + features["state_events"][state_event_start_idx:state_event_end_idx], features[feature_key], ], - axis=0, ) + axis=0, + ) return features -def map_midi_programs(feature, - codec: Codec, - granularity_type: str="full", - feature_key: str="inputs") -> Mapping[str, Any]: +def map_midi_programs( + feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs" +) -> Mapping[str, Any]: """Apply MIDI program map to token sequences.""" granularity = PROGRAM_GRANULARITIES[granularity_type] - feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], - codec) + feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec) return feature def run_length_encode_shifts_fn( - features, - codec: Codec, - feature_key: str="inputs", - state_change_event_types: Sequence[str]=( - ), ) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: + features, + codec: Codec, + feature_key: str = "inputs", + state_change_event_types: Sequence[str] = (), +) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: """Return a function that run-length encodes shifts for a given codec. Args: @@ -585,13 +569,9 @@ def run_length_encode_shifts_fn( Returns: A preprocessing function that run-length encodes single-step shifts. """ - state_change_event_ranges = [ - codec.event_type_range(event_type) - for event_type in state_change_event_types - ] + state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types] - def run_length_encode_shifts( - features: MutableMapping[str, Any]) -> Mapping[str, Any]: + def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]: """Combine leading/interior shifts, trim trailing shifts. Args: @@ -613,8 +593,7 @@ def run_length_encode_shifts( # If this event is a state change and has the same value as the current # state, we can skip it entirely. 
is_redundant = False - for i, (min_index, - max_index) in enumerate(state_change_event_ranges): + for i, (min_index, max_index) in enumerate(state_change_event_ranges): if min_index <= event and event <= max_index: if current_state[i] == event: is_redundant = True @@ -627,10 +606,8 @@ def run_length_encode_shifts( if shift_steps > 0: shift_steps = total_shift_steps while shift_steps > 0: - output_steps = np.minimum(codec.max_shift_steps, - shift_steps) - output = np.concatenate( - [output, [output_steps]], axis=0) + output_steps = np.minimum(codec.max_shift_steps, shift_steps) + output = np.concatenate([output, [output_steps]], axis=0) shift_steps -= output_steps output = np.concatenate([output, [event]], axis=0) features[feature_key] = output @@ -639,42 +616,32 @@ def run_length_encode_shifts( return run_length_encode_shifts(features) -def note_representation_processor_chain( - features, - codec: Codec, - note_representation_config: NoteRepresentationConfig): +def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig): tie_token = codec.encode_event(Event("tie", 0)) - state_events_end_token = (tie_token if - note_representation_config.include_ties else None) + state_events_end_token = tie_token if note_representation_config.include_ties else None features = extract_sequence_with_indices( - features, - state_events_end_token=state_events_end_token, - feature_key="inputs") + features, state_events_end_token=state_events_end_token, feature_key="inputs" + ) features = map_midi_programs(features, codec) - features = run_length_encode_shifts_fn( - features, codec, state_change_event_types=["velocity", "program"]) + features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"]) return features class MidiProcessor: def __init__(self): self.codec = Codec( - max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * - DEFAULT_STEPS_PER_SECOND, + max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND, steps_per_second=DEFAULT_STEPS_PER_SECOND, event_ranges=[ - EventRange("pitch", note_seq.MIN_MIDI_PITCH, - note_seq.MAX_MIDI_PITCH), + EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS), EventRange("tie", 0, 0), - EventRange("program", note_seq.MIN_MIDI_PROGRAM, - note_seq.MAX_MIDI_PROGRAM), - EventRange("drum", note_seq.MIN_MIDI_PITCH, - note_seq.MAX_MIDI_PITCH), - ], ) + EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM), + EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + ], + ) self.tokenizer = Tokenizer(self.codec.num_classes) - self.note_representation_config = NoteRepresentationConfig( - onsets_only=False, include_ties=True) + self.note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True) def __call__(self, midi: Union[bytes, os.PathLike, str]): if not isinstance(midi, bytes): @@ -695,13 +662,10 @@ def __call__(self, midi: Union[bytes, os.PathLike, str]): frame_times=frame_times, codec=self.codec, encode_event_fn=note_event_data_to_events, - encoding_state_to_events_fn=note_encoding_state_to_events, ) + encoding_state_to_events_fn=note_encoding_state_to_events, + ) events = [ - note_representation_processor_chain(event, self.codec, - self.note_representation_config) - for event in events - ] - input_tokens = [ - self.tokenizer.encode(event["inputs"]) for event in events + note_representation_processor_chain(event, self.codec, 
self.note_representation_config) for event in events ] + input_tokens = [self.tokenizer.encode(event["inputs"]) for event in events] return input_tokens diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py index 73d0d48ee3f28..bcf4c659a6e5f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py @@ -17,25 +17,25 @@ from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm -from ...configuration_utils import (ConfigMixin, ModuleUtilsMixin, - register_to_config) +from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config from ...models import ModelMixin class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( - self, - max_length: int, - vocab_size: int, - d_model: int, - dropout_rate: float, - num_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str, - is_decoder: bool=False, ): + self, + max_length: int, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + is_decoder: bool = False, + ): super().__init__() self.token_embedder = nn.Embedding(vocab_size, d_model) self.position_encoding = nn.Embedding(max_length, d_model) @@ -50,7 +50,8 @@ def __init__( dropout_rate=dropout_rate, feed_forward_proj=feed_forward_proj, is_decoder=is_decoder, - is_encoder_decoder=False, ) + is_encoder_decoder=False, + ) self.encoders = nn.LayerList() for lyr_num in range(num_layers): lyr = T5Block(t5config) @@ -67,8 +68,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): # inverted the attention mask input_shape = encoder_input_tokens.shape - extended_attention_mask = self.get_extended_attention_mask( - encoder_inputs_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) for lyr in self.encoders: x = lyr(x, extended_attention_mask)[0] x = self.layer_norm(x) diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index a7c2673f560f3..000fc9a868b02 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -33,12 +33,13 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline): _optional_components = ["melgan"] def __init__( - self, - notes_encoder: SpectrogramNotesEncoder, - continuous_encoder: SpectrogramContEncoder, - decoder: T5FilmDecoder, - scheduler: DDPMScheduler, - melgan: (Any), ) -> None: + self, + notes_encoder: SpectrogramNotesEncoder, + continuous_encoder: SpectrogramContEncoder, + decoder: T5FilmDecoder, + scheduler: DDPMScheduler, + melgan: (Any), + ) -> None: super().__init__() # From MELGAN @@ -50,25 +51,23 @@ def __init__( continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler, - melgan=melgan, ) + melgan=melgan, + ) def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): """Linearly scale features to network outputs range.""" min_out, max_out = output_range if clip: - features = paddle.clip( - x=features, min=self.min_value, 
max=self.max_value) + features = paddle.clip(x=features, min=self.min_value, max=self.max_value) # Scale to [0, 1]. - zero_one = (features - self.min_value) / ( - self.max_value - self.min_value) + zero_one = (features - self.min_value) / (self.max_value - self.min_value) # Scale to [min_out, max_out]. return zero_one * (max_out - min_out) + min_out def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): """Invert by linearly scaling network outputs to features range.""" min_out, max_out = input_range - outputs = paddle.clip( - x=outputs, min=min_out, max=max_out) if clip else outputs + outputs = paddle.clip(x=outputs, min=min_out, max=max_out) if clip else outputs # Scale to [0, 1]. zero_one = (outputs - min_out) / (max_out - min_out) # Scale to [self.min_value, self.max_value]. @@ -77,29 +76,27 @@ def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 tokens_encoded, tokens_mask = self.notes_encoder( - encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask) + encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask + ) continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs.cast( - self.continuous_encoder.dtype), - encoder_inputs_mask=continuous_mask, ) - return [(tokens_encoded, tokens_mask), (continuous_encoded, - continuous_mask)] + encoder_inputs=continuous_inputs.cast(self.continuous_encoder.dtype), + encoder_inputs_mask=continuous_mask, + ) + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] def decode(self, encodings_and_masks, input_tokens, noise_time): timesteps = noise_time if not paddle.is_tensor(x=timesteps): - timesteps = paddle.to_tensor( - data=[timesteps], dtype="int64", place=input_tokens.place) + timesteps = paddle.to_tensor(data=[timesteps], dtype="int64", place=input_tokens.place) elif paddle.is_tensor(x=timesteps) and len(timesteps.shape) == 0: if isinstance(input_tokens.place, paddle.dtype): dtype = input_tokens.place - elif isinstance(input_tokens.place, - str) and input_tokens.place not in [ - "cpu", - "cuda", - "ipu", - "xpu", - ]: + elif isinstance(input_tokens.place, str) and input_tokens.place not in [ + "cpu", + "cuda", + "ipu", + "xpu", + ]: dtype = input_tokens.place elif isinstance(input_tokens.place, paddle.Tensor): dtype = input_tokens.place.dtype @@ -107,40 +104,41 @@ def decode(self, encodings_and_masks, input_tokens, noise_time): dtype = timesteps[None].dtype timesteps = timesteps[None].cast(dtype) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps * paddle.ones( - shape=input_tokens.shape[0], dtype=timesteps.dtype) + timesteps = timesteps * paddle.ones(shape=input_tokens.shape[0], dtype=timesteps.dtype) logits = self.decoder( encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, - decoder_noise_time=timesteps, ) + decoder_noise_time=timesteps, + ) return logits @paddle.no_grad() def __call__( - self, - input_tokens: List[List[int]], - generator: Optional[paddle.Generator]=None, - num_inference_steps: int=100, - return_dict: bool=True, - output_type: str="numpy", - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, ) -> Union[AudioPipelineOutput, Tuple]: - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + self, + input_tokens: 
List[List[int]], + generator: Optional[paddle.Generator] = None, + num_inference_steps: int = 100, + return_dict: bool = True, + output_type: str = "numpy", + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + ) -> Union[AudioPipelineOutput, Tuple]: + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) - pred_mel = np.zeros( - [1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) ones = paddle.ones(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) for i, encoder_input_tokens in enumerate(input_tokens): if i == 0: - encoder_continuous_inputs = paddle.to_tensor( - data=pred_mel[:1].copy()).cast(self.decoder.dtype) + encoder_continuous_inputs = paddle.to_tensor(data=pred_mel[:1].copy()).cast(self.decoder.dtype) # The first chunk has no previous context. - encoder_continuous_mask = paddle.zeros( - shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) + encoder_continuous_mask = paddle.zeros(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) else: # The full song pipeline does not feed in a context feature, so the mask # will be all 0s after the feature converter. Because we know we're @@ -148,17 +146,19 @@ def __call__( # to all 1s. encoder_continuous_mask = ones encoder_continuous_inputs = self.scale_features( - encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) + encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True + ) encodings_and_masks = self.encode( - input_tokens=paddle.to_tensor( - data=[encoder_input_tokens], dtype="int32"), + input_tokens=paddle.to_tensor(data=[encoder_input_tokens], dtype="int32"), continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, ) + continuous_mask=encoder_continuous_mask, + ) # Sample encoder_continuous_inputs shaped gaussian noise to begin loop x = randn_tensor( shape=encoder_continuous_inputs.shape, generator=generator, - dtype=self.decoder.dtype, ) + dtype=self.decoder.dtype, + ) # set step values self.scheduler.set_timesteps(num_inference_steps) # Denoising diffusion loop @@ -166,26 +166,24 @@ def __call__( output = self.decode( encodings_and_masks=encodings_and_masks, input_tokens=x, - noise_time=t / self.scheduler.config.num_train_timesteps, ) + noise_time=t / self.scheduler.config.num_train_timesteps, + ) # Compute previous output: x_t -> x_t-1 - x = self.scheduler.step( - output, t, x, generator=generator).prev_sample + x = self.scheduler.step(output, t, x, generator=generator).prev_sample mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) encoder_continuous_inputs = mel[:1] pred_mel = mel.cpu().astype(dtype="float32").numpy() - full_pred_mel = np.concatenate( - [full_pred_mel, pred_mel[:1]], axis=1) + full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) # call the callback, if provided if callback is not None and i % callback_steps == 0: callback(i, full_pred_mel) logger.info("Generated segment", i) if output_type == "numpy": - output = self.melgan( - input_features=full_pred_mel.astype(np.float32))[0] + output = self.melgan(input_features=full_pred_mel.astype(np.float32))[0] else: output = full_pred_mel if not return_dict: - return (output, ) + return (output,) return 
AudioPipelineOutput(audios=output) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py index 5bcf303c00772..fa4dcc515380f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py @@ -19,10 +19,15 @@ import numpy as np import PIL.Image -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_fastdeploy_available, is_k_diffusion_available, - is_k_diffusion_version, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_fastdeploy_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_paddle_available, + is_paddlenlp_available, +) @dataclass @@ -51,44 +56,46 @@ class StableDiffusionPipelineOutput(BaseOutput): else: # new added from .hf_clip_model import ( - HFCLIPModel, HFCLIPTextModel, HFCLIPTextModelWithProjection, - HFCLIPVisionModel, HFCLIPVisionModelWithProjection) + HFCLIPModel, + HFCLIPTextModel, + HFCLIPTextModelWithProjection, + HFCLIPVisionModel, + HFCLIPVisionModelWithProjection, + ) from .pipeline_cycle_diffusion import CycleDiffusionPipeline from .pipeline_stable_diffusion import StableDiffusionPipeline - from .pipeline_stable_diffusion_adapter import \ - StableDiffusionAdapterPipeline - from .pipeline_stable_diffusion_all_in_one import \ - StableDiffusionPipelineAllinOne - from .pipeline_stable_diffusion_attend_and_excite import \ - StableDiffusionAttendAndExcitePipeline - from .pipeline_stable_diffusion_controlnet import \ - StableDiffusionControlNetPipeline - from .pipeline_stable_diffusion_depth2img import \ - StableDiffusionDepth2ImgPipeline - from .pipeline_stable_diffusion_image_variation import \ - StableDiffusionImageVariationPipeline - from .pipeline_stable_diffusion_img2img import \ - StableDiffusionImg2ImgPipeline - from .pipeline_stable_diffusion_inpaint import \ - StableDiffusionInpaintPipeline - from .pipeline_stable_diffusion_inpaint_legacy import \ - StableDiffusionInpaintPipelineLegacy - from .pipeline_stable_diffusion_instruct_pix2pix import \ - StableDiffusionInstructPix2PixPipeline - from .pipeline_stable_diffusion_k_diffusion import \ - StableDiffusionKDiffusionPipeline - from .pipeline_stable_diffusion_latent_upscale import \ - StableDiffusionLatentUpscalePipeline + from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline + from .pipeline_stable_diffusion_all_in_one import StableDiffusionPipelineAllinOne + from .pipeline_stable_diffusion_attend_and_excite import ( + StableDiffusionAttendAndExcitePipeline, + ) + from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline + from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline + from .pipeline_stable_diffusion_image_variation import ( + StableDiffusionImageVariationPipeline, + ) + from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline + from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline + from .pipeline_stable_diffusion_inpaint_legacy import ( + StableDiffusionInpaintPipelineLegacy, + ) + from .pipeline_stable_diffusion_instruct_pix2pix import ( + StableDiffusionInstructPix2PixPipeline, + ) + from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline + from .pipeline_stable_diffusion_latent_upscale import ( + StableDiffusionLatentUpscalePipeline, + ) from .pipeline_stable_diffusion_mega 
import StableDiffusionMegaPipeline - from .pipeline_stable_diffusion_model_editing import \ - StableDiffusionModelEditingPipeline - from .pipeline_stable_diffusion_panorama import \ - StableDiffusionPanoramaPipeline - from .pipeline_stable_diffusion_pix2pix_zero import \ - StableDiffusionPix2PixZeroPipeline + from .pipeline_stable_diffusion_model_editing import ( + StableDiffusionModelEditingPipeline, + ) + from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline + from .pipeline_stable_diffusion_pix2pix_zero import ( + StableDiffusionPix2PixZeroPipeline, + ) from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline - from .pipeline_stable_diffusion_upscale import \ - StableDiffusionUpscalePipeline + from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline from .pipeline_stable_unclip import StableUnCLIPPipeline from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline from .safety_checker import StableDiffusionSafetyChecker @@ -100,21 +107,26 @@ class StableDiffusionPipelineOutput(BaseOutput): except OptionalDependencyNotAvailable: from ...utils.dummy_fastdeploy_objects import * # noqa F403 else: - from .pipeline_fastdeploy_cycle_diffusion import \ - FastDeployCycleDiffusionPipeline - from .pipeline_fastdeploy_stable_diffusion import \ - FastDeployStableDiffusionPipeline - from .pipeline_fastdeploy_stable_diffusion_controlnet import \ - FastDeployStableDiffusionControlNetPipeline - from .pipeline_fastdeploy_stable_diffusion_image_variation import \ - FastDeployStableDiffusionImageVariationPipeline - from .pipeline_fastdeploy_stable_diffusion_img2img import \ - FastDeployStableDiffusionImg2ImgPipeline - from .pipeline_fastdeploy_stable_diffusion_inpaint import \ - FastDeployStableDiffusionInpaintPipeline - from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import \ - FastDeployStableDiffusionInpaintPipelineLegacy - from .pipeline_fastdeploy_stable_diffusion_mega import \ - FastDeployStableDiffusionMegaPipeline - from .pipeline_fastdeploy_stable_diffusion_upscale import \ - FastDeployStableDiffusionUpscalePipeline + from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline + from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline + from .pipeline_fastdeploy_stable_diffusion_controlnet import ( + FastDeployStableDiffusionControlNetPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_image_variation import ( + FastDeployStableDiffusionImageVariationPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_img2img import ( + FastDeployStableDiffusionImg2ImgPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_inpaint import ( + FastDeployStableDiffusionInpaintPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import ( + FastDeployStableDiffusionInpaintPipelineLegacy, + ) + from .pipeline_fastdeploy_stable_diffusion_mega import ( + FastDeployStableDiffusionMegaPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_upscale import ( + FastDeployStableDiffusionUpscalePipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 1b9ac762bae8a..3f1cbee1f4454 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -22,21 +22,37 @@ import numpy as np import requests from paddlenlp.transformers import ( - 
BertTokenizer, CLIPFeatureExtractor, CLIPImageProcessor, CLIPTextModel, - CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, - CLIPVisionModelWithProjection) - -from ...models import (AutoencoderKL, ControlNetModel, PriorTransformer, - UNet2DConditionModel) + BertTokenizer, + CLIPFeatureExtractor, + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ...models import ( + AutoencoderKL, + ControlNetModel, + PriorTransformer, + UNet2DConditionModel, +) from ...schedulers import ( - DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, UnCLIPScheduler) + DDIMScheduler, + DDPMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UnCLIPScheduler, +) from ...utils import is_omegaconf_available, logging from ...utils.import_utils import BACKENDS_MAPPING from ...utils.load_utils import smart_load -from ..latent_diffusion.pipeline_latent_diffusion import (LDMBertConfig, - LDMBertModel) +from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from ..paint_by_example import PaintByExampleImageEncoder from ..pipeline_utils import DiffusionPipeline from .safety_checker import StableDiffusionSafetyChecker @@ -70,8 +86,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -87,8 +102,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -131,8 +145,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -140,21 +153,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." 
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -162,13 +174,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -179,8 +189,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -190,8 +199,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -212,9 +220,7 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, - image_size: int, - controlnet=False): +def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): """ Creates a config for the diffusers based on the config of the LDM model. 
""" @@ -225,34 +231,28 @@ def create_unet_diffusers_config(original_config, vae_params = original_config.model.params.first_stage_config.params.ddconfig - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2**(len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = (unet_params.use_linear_in_transformer - if "use_linear_in_transformer" in unet_params else - False) + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: @@ -267,9 +267,7 @@ def create_unet_diffusers_config(original_config, assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels else: - raise NotImplementedError( - f"Unknown conditional unet num_classes config: {unet_params.num_classes}" - ) + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") config = { "sample_size": image_size // vae_scale_factor, @@ -281,8 +279,7 @@ def create_unet_diffusers_config(original_config, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, - "projection_class_embeddings_input_dim": - projection_class_embeddings_input_dim, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, } if not controlnet: @@ -328,7 +325,8 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular @@ -347,17 +345,19 @@ def create_ldm_bert_config(original_config): attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - pad_token_id=0, ) + pad_token_id=0, + ) return LDMBertConfig(**config) def convert_ldm_unet_checkpoint( - checkpoint, - config, - path=None, - extract_ema=False, - controlnet=False, - no_unet_key=False, ): + checkpoint, + config, + path=None, + extract_ema=False, + controlnet=False, + no_unet_key=False, +): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -384,8 +384,7 @@ def convert_ldm_unet_checkpoint( for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: if sum(k.startswith("model_ema") for k in keys) > 100: print( @@ -399,34 +398,23 @@ def convert_ldm_unet_checkpoint( new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] if config["class_embed_type"] is None: # No parameters to port ... - elif (config["class_embed_type"] == "timestep" or - config["class_embed_type"] == "projection"): - new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict[ - "label_emb.0.0.weight"] - new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict[ - "label_emb.0.0.bias"] - new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict[ - "label_emb.0.2.weight"] - new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict[ - "label_emb.0.2.bias"] + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] else: - raise NotImplementedError( - f"Not implemented `class_embed_type`: {config['class_embed_type']}") + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] if not controlnet: @@ -436,35 +424,23 @@ def convert_ldm_unet_checkpoint( new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + 
layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -473,21 +449,17 @@ def convert_ldm_unet_checkpoint( layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -499,7 +471,8 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -512,19 +485,18 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -533,14 +505,13 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -551,12 +522,8 @@ def convert_ldm_unet_checkpoint( output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" 
in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -570,22 +537,19 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - output_block_list = { - k: sorted(v) - for k, v in output_block_list.items() - } + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.bias", "conv.weight"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -595,27 +559,28 @@ def convert_ldm_unet_checkpoint( paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -624,48 +589,42 @@ def convert_ldm_unet_checkpoint( orig_index = 0 - new_checkpoint[ - "controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) orig_index += 2 diffusers_index = 0 while diffusers_index < 6: - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + 
new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) diffusers_index += 1 orig_index += 2 - new_checkpoint[ - "controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): - new_checkpoint[ - f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop( - f"zero_convs.{i}.0.weight") - new_checkpoint[ - f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop( - f"zero_convs.{i}.0.bias") + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") # mid block - new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop( - "middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop( - "middle_block_out.0.bias") + new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") return new_checkpoint @@ -681,107 +640,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + 
new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -789,58 +715,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -848,13 +766,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint): import paddle.nn as nn need_transpose = [] @@ -880,52 +798,56 @@ def convert_ldm_bert_checkpoint(checkpoint, config): bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[ - 
"transformer.token_emb.weight"] - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[ - "transformer.pos_emb.emb.weight"] + new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"] + new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"] for i in range(config.encoder_layers): double_i = 2 * i double_i_plus1 = 2 * i + 1 # convert norm new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight"] + f"transformer.attn_layers.layers.{double_i}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias"] - - new_checkpoint[ - f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"] + f"transformer.attn_layers.layers.{double_i}.0.bias" + ] + + new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_q.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_k.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_v.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" + ] new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight" + ].T new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight" + ].T new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].T + 
f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" + ].T - new_checkpoint["final_layer_norm.weight"] = bert_state_dict[ - "transformer.norm.weight"] - new_checkpoint["final_layer_norm.bias"] = bert_state_dict[ - "transformer.norm.bias"] + new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"] + new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"] ldmbert = LDMBertModel(config) ldmbert.eval() ldmbert.load_dict(new_checkpoint) @@ -942,12 +864,10 @@ def convert_ldm_clip_checkpoint(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len( - "cond_stage_model.transformer."):]] = checkpoint[key] + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model @@ -955,14 +875,14 @@ def convert_ldm_clip_checkpoint(checkpoint): textenc_conversion_lst = [ ( "cond_stage_model.model.positional_embedding", - "text_model.embeddings.position_embedding.weight", ), + "text_model.embeddings.position_embedding.weight", + ), ( "cond_stage_model.model.token_embedding.weight", - "text_model.embeddings.token_embedding.weight", ), - ("cond_stage_model.model.ln_final.weight", - "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", - "text_model.final_layer_norm.bias"), + "text_model.embeddings.token_embedding.weight", + ), + ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), + ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), ] textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} @@ -977,10 +897,12 @@ def convert_ldm_clip_checkpoint(checkpoint): ("ln_final.", "transformer.text_model.final_layer_norm."), ( "token_embedding.weight", - "transformer.text_model.embeddings.token_embedding.weight", ), + "transformer.text_model.embeddings.token_embedding.weight", + ), ( "positional_embedding", - "transformer.text_model.embeddings.position_embedding.weight", ), + "transformer.text_model.embeddings.position_embedding.weight", + ), ] protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} textenc_pattern = re.compile("|".join(protected.keys())) @@ -997,12 +919,11 @@ def convert_paint_by_example_checkpoint(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): - model_dict[key[len("cond_stage_model.transformer."):]] = checkpoint[ - key] + model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] # load mapper keys_mapper = { - k[len("cond_stage_model.mapper.res"):]: v + k[len("cond_stage_model.mapper.res") :]: v for k, v in checkpoint.items() if k.startswith("cond_stage_model.mapper") } @@ -1017,7 +938,7 @@ def convert_paint_by_example_checkpoint(checkpoint): } for key, value in keys_mapper.items(): - prefix = key[:len("blocks.i")] + prefix = key[: len("blocks.i")] suffix = key.split(prefix)[-1].split(".")[-1] name = key.split(prefix)[-1].split(suffix)[0][1:-1] mapped_names = MAPPING[name] @@ -1026,13 +947,11 @@ def convert_paint_by_example_checkpoint(checkpoint): for i, mapped_name in enumerate(mapped_names): new_name = ".".join([prefix, mapped_name, suffix]) shape = value.shape[0] // num_splits - model_dict[new_name] = value[i * shape:(i + 1) * shape] + model_dict[new_name] = value[i * shape : (i + 1) * shape] 
# load final layer norm - model_dict["final_layer_norm.bias"] = checkpoint[ - "cond_stage_model.final_ln.bias"] - model_dict["final_layer_norm.weight"] = checkpoint[ - "cond_stage_model.final_ln.bias"] + model_dict["final_layer_norm.bias"] = checkpoint["cond_stage_model.final_ln.bias"] + model_dict["final_layer_norm.weight"] = checkpoint["cond_stage_model.final_ln.bias"] # load proj_out model_dict["proj_out.bias"] = checkpoint["proj_out.bias"] @@ -1042,64 +961,50 @@ def convert_paint_by_example_checkpoint(checkpoint): model_dict["uncond_vector"] = checkpoint["learnable_vector"] if len(model_dict) > 0: - model.load_dict( - PaintByExampleImageEncoder.smart_convert(model_dict, model)) + model.load_dict(PaintByExampleImageEncoder.smart_convert(model_dict, model)) return model def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="text_encoder") + text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") text_model.eval() keys = list(checkpoint.keys()) text_model_dict = {} if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"] - .shape[0]) + d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) else: d_model = 1024 # text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") for key in keys: - if ("resblocks.23" in - key): # Diffusers drops the final layer and only uses the penultimate layer + if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer continue if key in textenc_conversion_map: text_model_dict[textenc_conversion_map[key]] = checkpoint[key] if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer."):] + new_key = key[len("cond_stage_model.model.transformer.") :] if new_key.endswith(".in_proj_weight"): - new_key = new_key[:-len(".in_proj_weight")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.weight"] = checkpoint[ - key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][ - d_model:d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][ - d_model * 2:, :] + new_key = new_key[: -len(".in_proj_weight")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] + text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] + text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] elif new_key.endswith(".in_proj_bias"): - new_key = new_key[:-len(".in_proj_bias")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[ - key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][ - d_model:d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][ - d_model * 2:] + new_key = new_key[: -len(".in_proj_bias")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] + text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] + text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model 
* 2 :] else: - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) text_model_dict[new_key] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model @@ -1121,17 +1026,13 @@ def stable_unclip_image_encoder(original_config): if clip_model_name == "ViT-L/14": feature_extractor = CLIPImageProcessor() - image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "openai/clip-vit-large-patch14") + image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") else: - raise NotImplementedError( - f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}" - ) + raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}") elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder": feature_extractor = CLIPImageProcessor() - image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K") + image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") else: raise NotImplementedError( f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}" @@ -1141,8 +1042,9 @@ def stable_unclip_image_encoder(original_config): def stable_unclip_image_noising_components( - original_config, - clip_stats_path: Optional[str]=None, ): + original_config, + clip_stats_path: Optional[str] = None, +): """ Returns the noising components for the img2img and txt2img unclip pipelines. 
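(Editor's note, not part of the diff: the two objects this helper returns are consumed later in the same file by the img2img and txt2img unclip pipelines, per the docstring above. A minimal usage sketch, mirroring the call that appears further down in download_from_original_stable_diffusion_ckpt:

    image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
        original_config,
        clip_stats_path=clip_stats_path,
    )
    # both components are then handed to the unclip pipeline constructors
)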
@@ -1162,15 +1064,12 @@ def stable_unclip_image_noising_components( max_noise_level = noise_aug_config.noise_schedule_config.timesteps beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule - image_normalizer = StableUnCLIPImageNormalizer( - embedding_dim=embedding_dim) - image_noising_scheduler = DDPMScheduler( - num_train_timesteps=max_noise_level, beta_schedule=beta_schedule) + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim) + image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule) if "clip_stats_path" in noise_aug_config: if clip_stats_path is None: - raise ValueError( - "This stable unclip config requires a `clip_stats_path`") + raise ValueError("This stable unclip config requires a `clip_stats_path`") from ...utils import torch_load @@ -1189,22 +1088,21 @@ def stable_unclip_image_noising_components( image_normalizer.load_dict(clip_stats_state_dict) else: - raise NotImplementedError( - f"Unknown noise augmentor class: {noise_aug_class}") + raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}") image_normalizer.eval() return image_normalizer, image_noising_scheduler def convert_controlnet_checkpoint( - checkpoint, - original_config, - checkpoint_path, - image_size, - upcast_attention, - extract_ema, - no_unet_key=False, ): - ctrlnet_config = create_unet_diffusers_config( - original_config, image_size=image_size, controlnet=True) + checkpoint, + original_config, + checkpoint_path, + image_size, + upcast_attention, + extract_ema, + no_unet_key=False, +): + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") @@ -1217,33 +1115,33 @@ def convert_controlnet_checkpoint( path=checkpoint_path, extract_ema=extract_ema, controlnet=True, - no_unet_key=no_unet_key, ) + no_unet_key=no_unet_key, + ) - controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, - converted_ctrl_checkpoint)) + controlnet_model.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint)) controlnet_model.eval() return controlnet_model def download_from_original_stable_diffusion_ckpt( - checkpoint_path: str, - original_config_file: str=None, - image_size: int=512, - prediction_type: str=None, - model_type: str=None, - extract_ema: bool=False, - scheduler_type: str="pndm", - num_in_channels: Optional[int]=None, - upcast_attention: Optional[bool]=None, - stable_unclip: Optional[str]=None, - stable_unclip_prior: Optional[str]=None, - clip_stats_path: Optional[str]=None, - controlnet: Optional[bool]=None, - load_safety_checker: bool=True, - pipeline_class: DiffusionPipeline=None, - paddle_dtype=None, - **kwargs, ) -> DiffusionPipeline: + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 512, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + stable_unclip: Optional[str] = None, + stable_unclip_prior: Optional[str] = None, + clip_stats_path: Optional[str] = None, + controlnet: Optional[bool] = None, + load_safety_checker: bool = True, + pipeline_class: DiffusionPipeline = None, + paddle_dtype=None, + **kwargs, +) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and 
(ideally) a `.yaml` config file. @@ -1288,10 +1186,14 @@ def download_from_original_stable_diffusion_ckpt( """ # import pipelines here to avoid circular import error when using from_ckpt method - from ppdiffusers import (LDMTextToImagePipeline, PaintByExamplePipeline, - StableDiffusionControlNetPipeline, - StableDiffusionPipeline, - StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline) + from ppdiffusers import ( + LDMTextToImagePipeline, + PaintByExamplePipeline, + StableDiffusionControlNetPipeline, + StableDiffusionPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) if pipeline_class is None or pipeline_class.__name__ == "DiffusionPipeline": pipeline_class = StableDiffusionPipeline @@ -1304,8 +1206,7 @@ def download_from_original_stable_diffusion_ckpt( from omegaconf import OmegaConf - checkpoint = smart_load( - checkpoint_path, return_numpy=True, return_global_step=True) + checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True) # NOTE: this while loop isn't great but this controlnet checkpoint has one additional # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21 @@ -1347,11 +1248,12 @@ def download_from_original_stable_diffusion_ckpt( original_config = OmegaConf.load(original_config_file) if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - if ("parameterization" in original_config["model"]["params"] and - original_config["model"]["params"]["parameterization"] == "v"): + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): if prediction_type is None: # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` # as it relies on a brittle global step parameter here @@ -1376,7 +1278,8 @@ def download_from_original_stable_diffusion_ckpt( checkpoint_path, image_size, upcast_attention, - extract_ema, ) + extract_ema, + ) num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start beta_end = original_config.model.params.linear_end @@ -1389,7 +1292,8 @@ def download_from_original_stable_diffusion_ckpt( steps_offset=1, clip_sample=False, set_alpha_to_one=False, - prediction_type=prediction_type, ) + prediction_type=prediction_type, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1404,8 +1308,7 @@ def download_from_original_stable_diffusion_ckpt( elif scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif scheduler_type == "ddim": @@ -1414,40 +1317,31 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") # Convert the UNet2DConditionModel model. 
- unet_config = create_unet_diffusers_config( - original_config, image_size=image_size) + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention unet = UNet2DConditionModel(**unet_config) unet.eval() converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema) - unet.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(unet, - converted_unet_checkpoint)) + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint)) # Convert the VAE model. - vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) vae.eval() - vae.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(vae, - converted_vae_checkpoint)) + vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint)) # Convert the text model. if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] - logger.debug( - f"no `model_type` given, `model_type` inferred as: {model_type}") + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2/tokenizer") + tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer") if stable_unclip is None: if controlnet: @@ -1460,7 +1354,8 @@ def download_from_original_stable_diffusion_ckpt( controlnet=controlnet_model, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) else: pipe = pipeline_class( vae=vae, @@ -1470,18 +1365,16 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) else: - ( - image_normalizer, - image_noising_scheduler, - ) = stable_unclip_image_noising_components( + (image_normalizer, image_noising_scheduler,) = stable_unclip_image_noising_components( original_config, - clip_stats_path=clip_stats_path, ) + clip_stats_path=clip_stats_path, + ) if stable_unclip == "img2img": - feature_extractor, image_encoder = stable_unclip_image_encoder( - original_config) + feature_extractor, image_encoder = stable_unclip_image_encoder(original_config) pipe = StableUnCLIPImg2ImgPipeline( # image encoding components @@ -1496,26 +1389,20 @@ def download_from_original_stable_diffusion_ckpt( unet=unet, scheduler=scheduler, # vae - vae=vae, ) + vae=vae, + ) elif stable_unclip == "txt2img": if stable_unclip_prior is None or stable_unclip_prior == "karlo": karlo_model = "kakaobrain/karlo-v1-alpha" - prior = PriorTransformer.from_pretrained( - karlo_model, subfolder="prior") - - prior_tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") - prior_text_model = CLIPTextModelWithProjection.from_pretrained( - "openai/clip-vit-large-patch14") - - prior_scheduler = 
UnCLIPScheduler.from_pretrained( - karlo_model, subfolder="prior_scheduler") - prior_scheduler = DDPMScheduler.from_config( - prior_scheduler.config) + prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior") + + prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") + + prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler") + prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config) else: - raise NotImplementedError( - f"unknown prior for stable unclip model: {stable_unclip_prior}" - ) + raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}") pipe = StableUnCLIPPipeline( # prior components @@ -1532,33 +1419,29 @@ def download_from_original_stable_diffusion_ckpt( unet=unet, scheduler=scheduler, # vae - vae=vae, ) + vae=vae, + ) else: - raise NotImplementedError( - f"unknown `stable_unclip` type: {stable_unclip}") + raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}") elif model_type == "PaintByExample": vision_model = convert_paint_by_example_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") - feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") pipe = PaintByExamplePipeline( vae=vae, image_encoder=vision_model, unet=unet, scheduler=scheduler, safety_checker=None, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) elif model_type == "FrozenCLIPEmbedder": text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if load_safety_checker: - safety_checker = StableDiffusionSafetyChecker.from_pretrained( - "CompVis/stable-diffusion-safety-checker") - feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker") + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") else: safety_checker = None feature_extractor = None @@ -1573,7 +1456,8 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=load_safety_checker, ) + requires_safety_checker=load_safety_checker, + ) else: pipe = pipeline_class( vae=vae, @@ -1583,19 +1467,20 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=load_safety_checker, ) + requires_safety_checker=load_safety_checker, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", model_max_length=77) + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77) pipe = LDMTextToImagePipeline( vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) if 
paddle_dtype is not None: pipe.to(paddle_dtype=paddle_dtype) @@ -1603,13 +1488,14 @@ def download_from_original_stable_diffusion_ckpt( def download_controlnet_from_original_ckpt( - checkpoint_path: str, - original_config_file: str, - image_size: int=512, - extract_ema: bool=False, - num_in_channels: Optional[int]=None, - upcast_attention: Optional[bool]=None, - no_unet_key: Optional[bool]=False, ) -> DiffusionPipeline: + checkpoint_path: str, + original_config_file: str, + image_size: int = 512, + extract_ema: bool = False, + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + no_unet_key: Optional[bool] = False, +) -> DiffusionPipeline: if not is_omegaconf_available(): raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) @@ -1636,12 +1522,10 @@ def download_controlnet_from_original_ckpt( original_config = OmegaConf.load(original_config_file) if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels if "control_stage_config" not in original_config.model.params: - raise ValueError( - "`control_stage_config` not present in original config") + raise ValueError("`control_stage_config` not present in original config") controlnet_model = convert_controlnet_checkpoint( checkpoint, @@ -1650,6 +1534,7 @@ def download_controlnet_from_original_ckpt( image_size, upcast_attention, extract_ema, - no_unet_key, ) + no_unet_key, + ) return controlnet_model diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py index fc8dfda8a0781..4a8c6336fd55d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py @@ -20,17 +20,32 @@ import numpy as np import requests -from paddlenlp.transformers import (BertTokenizer, CLIPFeatureExtractor, - CLIPTextModel, CLIPTokenizer) +from paddlenlp.transformers import ( + BertTokenizer, + CLIPFeatureExtractor, + CLIPTextModel, + CLIPTokenizer, +) from ppdiffusers import ( - AutoencoderKL, ControlNetModel, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LDMTextToImagePipeline, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionControlNetPipeline, StableDiffusionPipeline, - UNet2DConditionModel) + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionControlNetPipeline, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertConfig, LDMBertModel) + LDMBertConfig, + LDMBertModel, +) from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from ...utils import is_omegaconf_available, logging @@ -65,8 +80,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, 
n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -82,8 +96,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -126,8 +139,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -135,21 +147,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -157,13 +168,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -174,8 +183,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -185,8 +193,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -207,9 +214,7 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, - image_size: int, - controlnet=False): +def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): """ Creates a config for the diffusers based on the config of the LDM model. 
""" @@ -220,34 +225,28 @@ def create_unet_diffusers_config(original_config, vae_params = original_config.model.params.first_stage_config.params.ddconfig - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2**(len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = (unet_params.use_linear_in_transformer - if "use_linear_in_transformer" in unet_params else - False) + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: @@ -262,9 +261,7 @@ def create_unet_diffusers_config(original_config, assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels else: - raise NotImplementedError( - f"Unknown conditional unet num_classes config: {unet_params.num_classes}" - ) + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") config = dict( sample_size=image_size // vae_scale_factor, @@ -304,7 +301,8 @@ def create_vae_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config @@ -330,15 +328,12 @@ def create_ldm_bert_config(original_config): attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - pad_token_id=0, ) + pad_token_id=0, + ) return LDMBertConfig(**config) -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False, - controlnet=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -362,8 +357,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith(unet_key[:-1]): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: if sum(k.startswith("model_ema") for k in keys) > 100: print( @@ -377,34 +371,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] if config["class_embed_type"] is None: # No parameters to port ... - elif (config["class_embed_type"] == "timestep" or - config["class_embed_type"] == "projection"): - new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict[ - "label_emb.0.0.weight"] - new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict[ - "label_emb.0.0.bias"] - new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict[ - "label_emb.0.2.weight"] - new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict[ - "label_emb.0.2.bias"] + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] else: - raise NotImplementedError( - f"Not implemented `class_embed_type`: {config['class_embed_type']}") + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] if not controlnet: @@ -414,35 +397,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if 
f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -451,21 +422,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -477,7 +444,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -490,19 +458,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -511,14 +478,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -529,12 +495,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if 
len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -548,22 +510,19 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - output_block_list = { - k: sorted(v) - for k, v in output_block_list.items() - } + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.bias", "conv.weight"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -573,27 +532,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -602,48 +562,42 @@ def convert_ldm_unet_checkpoint(checkpoint, orig_index = 0 - new_checkpoint[ - "controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) orig_index += 2 diffusers_index = 0 while diffusers_index < 6: - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + 
new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) diffusers_index += 1 orig_index += 2 - new_checkpoint[ - "controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): - new_checkpoint[ - f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop( - f"zero_convs.{i}.0.weight") - new_checkpoint[ - f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop( - f"zero_convs.{i}.0.bias") + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") # mid block - new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop( - "middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop( - "middle_block_out.0.bias") + new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") return new_checkpoint @@ -659,107 +613,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + 
new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": 
f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -767,58 +688,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -826,13 +739,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint): import paddle.nn as nn need_transpose = [] @@ -858,52 +771,56 @@ def convert_ldm_bert_checkpoint(checkpoint, config): 
bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[ - "transformer.token_emb.weight"] - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[ - "transformer.pos_emb.emb.weight"] + new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"] + new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"] for i in range(config.encoder_layers): double_i = 2 * i double_i_plus1 = 2 * i + 1 # convert norm new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight"] + f"transformer.attn_layers.layers.{double_i}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias"] - - new_checkpoint[ - f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"] + f"transformer.attn_layers.layers.{double_i}.0.bias" + ] + + new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_q.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_k.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_v.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" + ] new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight" + ].T new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight" + ].T 
new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" + ].T - new_checkpoint["final_layer_norm.weight"] = bert_state_dict[ - "transformer.norm.weight"] - new_checkpoint["final_layer_norm.bias"] = bert_state_dict[ - "transformer.norm.bias"] + new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"] + new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"] ldmbert = LDMBertModel(config) ldmbert.eval() ldmbert.load_dict(new_checkpoint) @@ -911,8 +828,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config): def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="text_encoder") + text_model = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder") text_model.eval() keys = list(checkpoint.keys()) @@ -921,12 +837,10 @@ def convert_ldm_clip_checkpoint(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len( - "cond_stage_model.transformer."):]] = checkpoint[key] + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model @@ -934,14 +848,14 @@ def convert_ldm_clip_checkpoint(checkpoint): textenc_conversion_lst = [ ( "cond_stage_model.model.positional_embedding", - "text_model.embeddings.position_embedding.weight", ), + "text_model.embeddings.position_embedding.weight", + ), ( "cond_stage_model.model.token_embedding.weight", - "text_model.embeddings.token_embedding.weight", ), - ("cond_stage_model.model.ln_final.weight", - "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", - "text_model.final_layer_norm.bias"), + "text_model.embeddings.token_embedding.weight", + ), + ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), + ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), ] textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} @@ -956,83 +870,73 @@ def convert_ldm_clip_checkpoint(checkpoint): ("ln_final.", "transformer.text_model.final_layer_norm."), ( "token_embedding.weight", - "transformer.text_model.embeddings.token_embedding.weight", ), + "transformer.text_model.embeddings.token_embedding.weight", + ), ( "positional_embedding", - "transformer.text_model.embeddings.position_embedding.weight", ), + "transformer.text_model.embeddings.position_embedding.weight", + ), ] protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} textenc_pattern = re.compile("|".join(protected.keys())) def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="text_encoder") + text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") text_model.eval() keys = list(checkpoint.keys()) text_model_dict = {} if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"] - .shape[0]) + d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) else: d_model = 1024 for key in keys: - if ("resblocks.23" in 
- key): # Diffusers drops the final layer and only uses the penultimate layer + if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer continue if key in textenc_conversion_map: text_model_dict[textenc_conversion_map[key]] = checkpoint[key] if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer."):] + new_key = key[len("cond_stage_model.model.transformer.") :] if new_key.endswith(".in_proj_weight"): - new_key = new_key[:-len(".in_proj_weight")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.weight"] = checkpoint[ - key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][ - d_model:d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][ - d_model * 2:, :] + new_key = new_key[: -len(".in_proj_weight")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] + text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] + text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] elif new_key.endswith(".in_proj_bias"): - new_key = new_key[:-len(".in_proj_bias")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[ - key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][ - d_model:d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][ - d_model * 2:] + new_key = new_key[: -len(".in_proj_bias")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] + text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] + text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] else: - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) text_model_dict[new_key] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model def load_pipeline_from_original_stable_diffusion_ckpt( - checkpoint_path: str, - original_config_file: str=None, - image_size: int=512, - prediction_type: str=None, - model_type: str=None, - extract_ema: bool=False, - scheduler_type: str="pndm", - num_in_channels: Optional[int]=None, - upcast_attention: Optional[bool]=None, - paddle_dtype: Optional[bool]=None, - requires_safety_checker: bool=False, - controlnet: Optional[bool]=None, - cls=None, - **kwargs, ) -> StableDiffusionPipeline: + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 512, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + paddle_dtype: Optional[bool] = None, + requires_safety_checker: bool = False, + controlnet: Optional[bool] = None, + cls=None, + **kwargs, +) -> StableDiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` 
config file. @@ -1079,8 +983,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( from omegaconf import OmegaConf - checkpoint = smart_load( - checkpoint_path, return_numpy=True, return_global_step=True) + checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True) global_step = int(checkpoint.pop("global_step", -1)) @@ -1106,8 +1009,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" original_config_file = os.path.join(tmpdir, "inference.yaml") - if key_name in checkpoint and checkpoint[key_name].shape[ - -1] == 1024: + if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: if not os.path.isfile("v2-inference-v.yaml"): # model_type = "v2" r = requests.get( @@ -1129,11 +1031,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt( original_config = OmegaConf.load(original_config_file) if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - if ("parameterization" in original_config["model"]["params"] and - original_config["model"]["params"]["parameterization"] == "v"): + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): if prediction_type is None: # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` # as it relies on a brittle global step parameter here @@ -1160,7 +1063,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( steps_offset=1, clip_sample=False, set_alpha_to_one=False, - prediction_type=prediction_type, ) + prediction_type=prediction_type, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1175,8 +1079,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( elif scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif scheduler_type == "ddim": @@ -1185,44 +1088,35 @@ def load_pipeline_from_original_stable_diffusion_ckpt( raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config( - original_config, image_size=image_size) + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention unet = UNet2DConditionModel(**unet_config) unet.eval() converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema) - unet.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(unet, - converted_unet_checkpoint)) + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint)) # Convert the VAE model. 
- vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) vae.eval() - vae.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(vae, - converted_vae_checkpoint)) + vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint)) # Convert the text model. if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] - logger.debug( - f"no `model_type` given, `model_type` inferred as: {model_type}") + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") if controlnet is None: controlnet = "control_stage_config" in original_config.model.params if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2/tokenizer") + tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer") if paddle_dtype is not None: vae.to(dtype=paddle_dtype) @@ -1231,8 +1125,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( if controlnet: # Convert the ControlNetModel model. - ctrlnet_config = create_unet_diffusers_config( - original_config, image_size=image_size, controlnet=True) + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") @@ -1245,10 +1138,11 @@ def load_pipeline_from_original_stable_diffusion_ckpt( ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, - controlnet=True, ) + controlnet=True, + ) controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers( - controlnet_model, converted_ctrl_checkpoint)) + convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint) + ) if paddle_dtype is not None: controlnet_model.to(dtype=paddle_dtype) @@ -1262,7 +1156,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) else: pipe = cls( vae=vae, @@ -1272,17 +1167,19 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) elif model_type == "FrozenCLIPEmbedder": text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "CompVis/stable-diffusion-v1-4/tokenizer") + tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4/tokenizer") if requires_safety_checker: safety_checker = StableDiffusionSafetyChecker.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="safety_checker") + "CompVis/stable-diffusion-v1-4", subfolder="safety_checker" + ) feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor") + "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor" + ) else: safety_checker = feature_extractor = None @@ -1295,8 +1192,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( if controlnet: # 
Convert the ControlNetModel model. - ctrlnet_config = create_unet_diffusers_config( - original_config, image_size=image_size, controlnet=True) + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") @@ -1309,10 +1205,11 @@ def load_pipeline_from_original_stable_diffusion_ckpt( ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, - controlnet=True, ) + controlnet=True, + ) controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers( - controlnet_model, converted_ctrl_checkpoint)) + convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint) + ) if paddle_dtype is not None: controlnet_model.to(dtype=paddle_dtype) @@ -1326,7 +1223,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) else: pipe = cls( vae=vae, @@ -1336,12 +1234,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", model_max_length=77) + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77) if paddle_dtype is not None: vae.to(dtype=paddle_dtype) text_model.to(dtype=paddle_dtype) @@ -1351,6 +1249,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( bert=text_model, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) return pipe diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py index 29d9afb9eef79..5b406410e76aa 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py @@ -26,9 +26,15 @@ from paddle.distributed.fleet.utils import recompute from paddlenlp.transformers.activations import ACT2FN from paddlenlp.transformers.clip.configuration import ( - CLIPConfig, CLIPTextConfig, CLIPVisionConfig) + CLIPConfig, + CLIPTextConfig, + CLIPVisionConfig, +) from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, BaseModelOutputWithPooling, ModelOutput) + BaseModelOutput, + BaseModelOutputWithPooling, + ModelOutput, +) from paddlenlp.transformers.model_utils import PretrainedModel from ppdiffusers.initializer import normal_, ones_ @@ -39,7 +45,7 @@ ] -def finfo(dtype: paddle.dtype=None): +def finfo(dtype: paddle.dtype = None): if dtype is None: dtype = paddle.get_default_dtype() @@ -58,10 +64,7 @@ class BFloatFInfo: def Parameter(data: paddle.Tensor, requires_grad=True): - tensor = paddle.create_parameter( - data.shape, - dtype=data.dtype, - default_initializer=nn.initializer.Assign(data)) + tensor = paddle.create_parameter(data.shape, dtype=data.dtype, default_initializer=nn.initializer.Assign(data)) if not requires_grad: tensor.stop_gradient = True return tensor @@ -74,13 +77,14 @@ class TorchLinear(nn.Layer): """ def __init__( - self, - in_features, - out_features, - weight_attr=None, - 
bias_attr=None, - name=None, - bias=None, ): + self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None, + bias=None, + ): super().__init__() self._dtype = self._helper.get_default_dtype() self._weight_attr = weight_attr @@ -96,23 +100,25 @@ def __init__( ], # regular linear has shape [in_features, out_features] attr=self._weight_attr, dtype=self._dtype, - is_bias=False, ) + is_bias=False, + ) self.bias = self.create_parameter( shape=[out_features], attr=self._bias_attr, dtype=self._dtype, - is_bias=True, ) + is_bias=True, + ) self.name = name def forward(self, input): - out = F.linear( - x=input, weight=self.weight.T, bias=self.bias, name=self.name) + out = F.linear(x=input, weight=self.weight.T, bias=self.bias, name=self.name) return out def extra_repr(self): name_str = ", name={}".format(self.name) if self.name else "" return "in_features={}, out_features={}, dtype={}{}".format( - self.weight.shape[1], self.weight.shape[0], self._dtype, name_str) + self.weight.shape[1], self.weight.shape[0], self._dtype, name_str + ) def str2bool(v): @@ -139,20 +145,18 @@ def masked_fill(x, mask, value): return paddle.where(mask, y, x) -def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int]=None): +def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.shape tgt_len = tgt_len if tgt_len is not None else src_len - expanded_mask = ( - mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype)) + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype) inverted_mask = 1.0 - expanded_mask - return masked_fill(inverted_mask, - inverted_mask.cast(paddle.bool), finfo(dtype).min) + return masked_fill(inverted_mask, inverted_mask.cast(paddle.bool), finfo(dtype).min) # contrastive loss function, adapted from @@ -256,9 +260,10 @@ class HFCLIPOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: - return tuple(self[k] - if k not in ["text_model_output", "vision_model_output"] - else getattr(self, k).to_tuple() for k in self.keys()) + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) class HFCLIPVisionEmbeddings(nn.Layer): @@ -269,30 +274,29 @@ def __init__(self, config: CLIPVisionConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.class_embedding = Parameter(paddle.randn((self.embed_dim, ))) + self.class_embedding = Parameter(paddle.randn((self.embed_dim,))) self.patch_embedding = nn.Conv2D( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, - bias_attr=False, ) + bias_attr=False, + ) - self.num_patches = (self.image_size // self.patch_size)**2 + self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 - self.position_embedding = nn.Embedding(self.num_positions, - self.embed_dim) + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer( "position_ids", - paddle.arange(self.num_positions).expand( - (1, -1), dtype="int64"), - persistable=False, ) + paddle.arange(self.num_positions).expand((1, -1), dtype="int64"), + persistable=False, + ) def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = 
self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding( - pixel_values.cast(target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = self.patch_embedding(pixel_values.cast(target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) class_embeds = self.class_embedding.expand([batch_size, 1, -1]) @@ -307,23 +311,22 @@ def __init__(self, config: CLIPTextConfig): embed_dim = config.hidden_size self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, - embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer( "position_ids", - paddle.arange( - config.max_position_embeddings, dtype="int64").expand((1, -1)), - persistable=False, ) + paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)), + persistable=False, + ) def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - inputs_embeds: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: - seq_length = (input_ids.shape[-1] - if input_ids is not None else inputs_embeds.shape[-2]) + self, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] if position_ids is None: position_ids = self.position_ids[:, :seq_length] @@ -349,7 +352,8 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") + f" {self.num_heads})." 
+ ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout @@ -359,18 +363,15 @@ def __init__(self, config): self.out_proj = LinearClass(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - causal_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[ - paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ - paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + causal_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape @@ -381,8 +382,7 @@ def forward( value_states = self._shape(self.v_proj(hidden_states), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, - bsz).reshape(proj_shape) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) key_states = key_states.reshape(proj_shape) value_states = value_states.reshape(proj_shape) @@ -392,29 +392,26 @@ def forward( if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: raise ValueError( f"Attention weights should be of size {[bsz * self.num_heads, tgt_len, src_len]}, but is" - f" {attn_weights.shape}") + f" {attn_weights.shape}" + ) # apply the causal_attention_mask first if causal_attention_mask is not None: if causal_attention_mask.shape != [bsz, 1, tgt_len, src_len]: raise ValueError( f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is" - f" {causal_attention_mask.shape}") - attn_weights = ( - attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + - causal_attention_mask) - attn_weights = attn_weights.reshape( - [bsz * self.num_heads, tgt_len, src_len]) + f" {causal_attention_mask.shape}" + ) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + causal_attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) if attention_mask is not None: if attention_mask.shape != [bsz, 1, tgt_len, src_len]: raise ValueError( f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}" ) - attn_weights = (attn_weights.reshape( - [bsz, self.num_heads, tgt_len, src_len]) + attention_mask) - attn_weights = attn_weights.reshape( - [bsz * self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) attn_weights = F.softmax(attn_weights, axis=-1) @@ -423,25 +420,22 @@ def forward( # make sure that attn_weights keeps its gradient. 
# In order to do so, attn_weights have to reshaped # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.reshape( - [bsz, self.num_heads, tgt_len, src_len]) - attn_weights = attn_weights_reshaped.reshape( - [bsz * self.num_heads, tgt_len, src_len]) + attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len]) else: attn_weights_reshaped = None - attn_probs = F.dropout( - attn_weights, p=self.dropout, training=self.training) + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = paddle.matmul(attn_probs, value_states) if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: raise ValueError( f"`attn_output` should be of size {[bsz, self.num_heads, tgt_len, self.head_dim]}, but is" - f" {attn_output.shape}") + f" {attn_output.shape}" + ) - attn_output = attn_output.reshape( - [bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) attn_output = attn_output.transpose([0, 2, 1, 3]) attn_output = attn_output.reshape([bsz, tgt_len, embed_dim]) @@ -470,18 +464,17 @@ def __init__(self, config: CLIPTextConfig): super().__init__() self.embed_dim = config.hidden_size self.self_attn = HFCLIPAttention(config) - self.layer_norm1 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = HFCLIPMLP(config) - self.layer_norm2 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - causal_attention_mask: paddle.Tensor, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + causal_attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -499,7 +492,8 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = residual + hidden_states residual = hidden_states @@ -507,10 +501,10 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states, ) + outputs = (hidden_states,) if output_attentions: - outputs += (attn_weights, ) + outputs += (attn_weights,) return outputs @@ -531,24 +525,21 @@ def _init_weights(self, module): factor = self.config.initializer_factor if isinstance(module, HFCLIPTextEmbeddings): normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02) - normal_( - module.position_embedding.weight, mean=0.0, std=factor * 0.02) + normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02) elif isinstance(module, HFCLIPVisionEmbeddings): factor = self.config.initializer_factor - normal_( - module.class_embedding, - mean=0.0, - std=module.embed_dim**-0.5 * factor) + normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) normal_( module.patch_embedding.weight, - std=module.config.initializer_range * factor, ) + std=module.config.initializer_range * factor, + ) 
normal_( module.position_embedding.weight, - std=module.config.initializer_range * factor, ) + std=module.config.initializer_range * factor, + ) elif isinstance(module, HFCLIPAttention): factor = self.config.initializer_factor - in_proj_std = ((module.embed_dim**-0.5) * ( - (2 * module.config.num_hidden_layers)**-0.5) * factor) + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor normal_(module.q_proj.weight, std=in_proj_std) normal_(module.k_proj.weight, std=in_proj_std) @@ -556,30 +547,31 @@ def _init_weights(self, module): normal_(module.out_proj.weight, std=out_proj_std) elif isinstance(module, HFCLIPMLP): factor = self.config.initializer_factor - in_proj_std = ((module.config.hidden_size**-0.5) * ( - (2 * module.config.num_hidden_layers)**-0.5) * factor) - fc_std = (2 * module.config.hidden_size)**-0.5 * factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor normal_(module.fc1.weight, std=fc_std) normal_(module.fc2.weight, std=in_proj_std) elif isinstance(module, HFCLIPModel): normal_( module.text_projection.weight, - std=module.text_embed_dim - **-0.5 * self.config.initializer_factor, ) + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) normal_( module.visual_projection.weight, - std=module.vision_embed_dim - **-0.5 * self.config.initializer_factor, ) + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) elif isinstance(module, HFCLIPVisionModelWithProjection): normal_( module.visual_projection.weight, - std=self.config.hidden_size - **-0.5 * self.config.initializer_factor, ) + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) elif isinstance(module, HFCLIPTextModelWithProjection): normal_( module.text_projection.weight, - std=self.config.hidden_size - **-0.5 * self.config.initializer_factor, ) + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) if isinstance(module, nn.LayerNorm): module.bias.zero_() @@ -599,9 +591,7 @@ def gradient_checkpointing_enable(self): activations". """ if not self.supports_gradient_checkpointing: - raise ValueError( - f"{self.__class__.__name__} does not support gradient checkpointing." 
- ) + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") self.apply(partial(self._set_gradient_checkpointing, value=True)) def gradient_checkpointing_disable(self): @@ -627,8 +617,7 @@ def register_load_torch_hook(self, function=None): def map_from(module, state_dict, *args, **kwargs): if state_dict.pop("is_torch_weight", False): need_transposed = [] - for name, layer in module.named_sublayers( - include_self=True): + for name, layer in module.named_sublayers(include_self=True): if isinstance(layer, nn.Linear): need_transposed.append(name + ".weight") module.need_transposed = need_transposed @@ -637,8 +626,7 @@ def map_from(module, state_dict, *args, **kwargs): else: map_from = function - self.load_torch_hook = self.register_load_state_dict_pre_hook( - map_from, with_module=True) + self.load_torch_hook = self.register_load_state_dict_pre_hook(map_from, with_module=True) return self.load_torch_hook def remove_load_torch_hook(self): @@ -651,7 +639,8 @@ def to(self=None, device=None, dtype=None, blocking=None): dtype=dtype, blocking=blocking, include_sublayers=True, - floating_only=True, ) + floating_only=True, + ) class HFCLIPEncoder(nn.Layer): @@ -666,20 +655,18 @@ class HFCLIPEncoder(nn.Layer): def __init__(self, config: CLIPConfig): super().__init__() self.config = config - self.layers = nn.LayerList([ - HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.LayerList([HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor]=None, - causal_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + causal_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -709,13 +696,11 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -723,7 +708,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -736,30 +721,31 @@ def custom_forward(*inputs): create_custom_forward(encoder_layer), hidden_states, attention_mask, - causal_attention_mask, ) + causal_attention_mask, + ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, ) + attentions=all_attentions, + ) # def _make_causal_mask( @@ -786,31 +772,28 @@ def __init__(self, config: CLIPTextConfig): embed_dim = config.hidden_size self.embeddings = HFCLIPTextEmbeddings(config) self.encoder = HFCLIPEncoder(config) - self.final_layer_norm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.final_layer_norm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) # For `pooled_output` computation self.eos_token_id = config.eos_token_id def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - 
self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -818,8 +801,7 @@ def forward( input_shape = input_ids.shape input_ids = input_ids.reshape([-1, input_shape[-1]]) - hidden_states = self.embeddings( - input_ids=input_ids, position_ids=position_ids) + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) bsz, seq_len = input_shape # CLIP's text model uses causal mask, prepare it here. @@ -828,7 +810,8 @@ def forward( causal_attention_mask = self._build_causal_attention_mask( bsz, seq_len, - hidden_states.dtype, ) + hidden_states.dtype, + ) # expand attention_mask if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -840,7 +823,8 @@ def forward( causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) @@ -855,24 +839,24 @@ def forward( pooled_output = last_hidden_state.gather_nd( paddle.stack( [ - paddle.arange( - last_hidden_state.shape[0], dtype="int32"), - input_ids.argmax( - -1, dtype="int32"), + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + input_ids.argmax(-1, dtype="int32"), ], - axis=-1, )) + axis=-1, + ) + ) else: # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible) # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) pooled_output = last_hidden_state.gather_nd( paddle.stack( [ - paddle.arange( - last_hidden_state.shape[0], dtype="int32"), - (input_ids == self.eos_token_id).cast("int32").argmax( - axis=-1, dtype="int32"), + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + (input_ids == self.eos_token_id).cast("int32").argmax(axis=-1, dtype="int32"), ], - axis=-1, )) + axis=-1, + ) + ) if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -881,12 +865,14 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) def _build_causal_attention_mask(self, bsz, seq_len, dtype): mask = paddle.triu( paddle.full((bsz, 1, seq_len, seq_len), finfo(dtype).min), - diagonal=1, ) + diagonal=1, + ) return mask @@ -908,14 +894,14 @@ def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> 
Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -933,8 +919,7 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled (EOS token) states ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict return self.text_model( input_ids=input_ids, @@ -942,7 +927,8 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) class HFCLIPVisionTransformer(nn.Layer): @@ -952,30 +938,26 @@ def __init__(self, config: CLIPVisionConfig): embed_dim = config.hidden_size self.embeddings = HFCLIPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.pre_layrnorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) self.encoder = HFCLIPEncoder(config) - self.post_layernorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -987,7 +969,8 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] @@ -1000,7 +983,8 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) class HFCLIPVisionModel(HFCLIPPretrainedModel): @@ -1017,12 +1001,12 @@ def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, 
BaseModelOutputWithPooling]: r""" Returns: @@ -1045,14 +1029,14 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) class HFCLIPModel(HFCLIPPretrainedModel): @@ -1064,12 +1048,14 @@ def __init__(self, config: CLIPConfig): if not isinstance(config.text_config, CLIPTextConfig): raise ValueError( "config.text_config is expected to be of type CLIPTextConfig but is of type" - f" {type(config.text_config)}.") + f" {type(config.text_config)}." + ) if not isinstance(config.vision_config, CLIPVisionConfig): raise ValueError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" - f" {type(config.vision_config)}.") + f" {type(config.vision_config)}." + ) text_config = config.text_config vision_config = config.vision_config @@ -1081,24 +1067,22 @@ def __init__(self, config: CLIPConfig): self.text_model = HFCLIPTextTransformer(text_config) self.vision_model = HFCLIPVisionTransformer(vision_config) - self.visual_projection = LinearClass( - self.vision_embed_dim, self.projection_dim, bias_attr=False) - self.text_projection = LinearClass( - self.text_embed_dim, self.projection_dim, bias_attr=False) - self.logit_scale = Parameter( - paddle.to_tensor(self.config.logit_scale_init_value)) + self.visual_projection = LinearClass(self.vision_embed_dim, self.projection_dim, bias_attr=False) + self.text_projection = LinearClass(self.text_embed_dim, self.projection_dim, bias_attr=False) + self.logit_scale = Parameter(paddle.to_tensor(self.config.logit_scale_init_value)) # Initialize weights and apply final processing self.post_init() def get_text_features( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> paddle.Tensor: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: r""" Returns: text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by @@ -1116,13 +1100,11 @@ def get_text_features( >>> text_features = model.get_text_features(**inputs) ```""" # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
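# (i.e. each of `output_attentions`, `output_hidden_states` and `return_dict` falls back to the
# corresponding attribute on `self.config` whenever the caller passes `None`)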
- output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.text_model( input_ids=input_ids, @@ -1130,7 +1112,8 @@ def get_text_features( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = text_outputs[1] text_features = self.text_projection(pooled_output) @@ -1138,11 +1121,12 @@ def get_text_features( return text_features def get_image_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> paddle.Tensor: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: r""" Returns: image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by @@ -1166,19 +1150,18 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = vision_outputs[1] # pooled_output image_features = self.visual_projection(pooled_output) @@ -1186,15 +1169,16 @@ def get_image_features( return image_features def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - pixel_values: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - return_loss: Optional[bool]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, HFCLIPOutput]: + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, HFCLIPOutput]: r""" Returns: @@ -1221,19 +1205,18 @@ def forward( >>> probs = F.softmax(logits_per_image.softmax, axis=1) # we can take the softmax to get the label probabilities ```""" # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) text_outputs = self.text_model( input_ids=input_ids, @@ -1241,7 +1224,8 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) image_embeds = vision_outputs[1] image_embeds = self.visual_projection(image_embeds) @@ -1250,14 +1234,12 @@ def forward( text_embeds = self.text_projection(text_embeds) # normalized features - image_embeds = image_embeds / image_embeds.norm( - p=2, axis=-1, keepdim=True) + image_embeds = image_embeds / image_embeds.norm(p=2, axis=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True) # cosine similarity as logits logit_scale = self.logit_scale.exp() - logits_per_text = paddle.matmul(text_embeds, - image_embeds.t()) * logit_scale + logits_per_text = paddle.matmul(text_embeds, image_embeds.t()) * logit_scale logits_per_image = logits_per_text.t() loss = None @@ -1271,8 +1253,9 @@ def forward( text_embeds, image_embeds, text_outputs, - vision_outputs, ) - return ((loss, ) + output) if loss is not None else output + vision_outputs, + ) + return ((loss,) + output) if loss is not None else output return HFCLIPOutput( loss=loss, @@ -1281,7 +1264,8 @@ def forward( text_embeds=text_embeds, image_embeds=image_embeds, text_model_output=text_outputs, - vision_model_output=vision_outputs, ) + vision_model_output=vision_outputs, + ) class HFCLIPTextModelWithProjection(HFCLIPPretrainedModel): @@ -1294,8 +1278,7 @@ def __init__(self, config: CLIPTextConfig): self.text_model = HFCLIPTextTransformer(config) - self.text_projection = LinearClass( - config.hidden_size, config.projection_dim, bias_attr=False) + self.text_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False) # Initialize weights and apply final processing self.post_init() @@ -1307,14 +1290,14 @@ def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - HFCLIPTextModelOutput]: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, HFCLIPTextModelOutput]: r""" Returns: @@ -1331,8 +1314,7 @@ def forward( >>> outputs = 
model(**inputs) >>> text_embeds = outputs.text_embeds ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.text_model( input_ids=input_ids, @@ -1340,7 +1322,8 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = text_outputs[1] @@ -1354,7 +1337,8 @@ def forward( text_embeds=text_embeds, last_hidden_state=text_outputs.last_hidden_state, hidden_states=text_outputs.hidden_states, - attentions=text_outputs.attentions, ) + attentions=text_outputs.attentions, + ) class HFCLIPVisionModelWithProjection(HFCLIPPretrainedModel): @@ -1366,8 +1350,7 @@ def __init__(self, config: CLIPVisionConfig): self.vision_model = HFCLIPVisionTransformer(config) - self.visual_projection = LinearClass( - config.hidden_size, config.projection_dim, bias_attr=False) + self.visual_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False) # Initialize weights and apply final processing self.post_init() @@ -1376,12 +1359,12 @@ def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, HFCLIPVisionModelOutput]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, HFCLIPVisionModelOutput]: r""" Returns: @@ -1403,14 +1386,14 @@ def forward( >>> outputs = model(**inputs) >>> image_embeds = outputs.image_embeds ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = vision_outputs[1] # pooled_output @@ -1424,4 +1407,5 @@ def forward( image_embeds=image_embeds, last_hidden_state=vision_outputs.last_hidden_state, hidden_states=vision_outputs.hidden_states, - attentions=vision_outputs.attentions, ) + attentions=vision_outputs.attentions, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index c74cfb57a53b3..80cf9f98c1082 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -20,8 +20,7 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...loaders import TextualInversionLoaderMixin @@ -46,11 +45,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, 
:] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -61,50 +56,46 @@ def preprocess(image): return image -def posterior_sample(scheduler, latents, timestep, clean_latents, generator, - eta): +def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps if prev_timestep <= 0: return clean_latents # 2. compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # direction pointing to x_t - e_t = (latents - alpha_prod_t** - (0.5) * clean_latents) / (1 - alpha_prod_t)**(0.5) - dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2)**(0.5) * e_t - noise = std_dev_t * randn_tensor( - clean_latents.shape, dtype=clean_latents.dtype, generator=generator) - prev_latents = alpha_prod_t_prev**(0.5) * clean_latents + dir_xt + noise + e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) + dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t + noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) + prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise return prev_latents def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps # 2. compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t** - (0.5) * noise_pred) / alpha_prod_t**(0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) # 4. Clip "predicted x_0" if scheduler.config.clip_sample: @@ -113,16 +104,14 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 5. compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # 6. 
compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( - 0.5) * noise_pred + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred - noise = (prev_latents - - (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction)) / ( - variance**(0.5) * eta) + noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( + variance ** (0.5) * eta + ) return noise @@ -156,31 +145,28 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -200,12 +186,10 @@ def __init__( f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -216,12 +200,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -233,18 +214,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -284,29 +267,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -314,8 +299,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -325,21 +309,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): 
raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -347,71 +332,67 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -424,7 +405,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): @@ -433,15 +415,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -449,11 +429,10 @@ def prepare_extra_step_kwargs(self, generator, eta): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -470,21 +449,14 @@ def decode_latents(self, latents): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): image = image.cast(dtype) batch_size = image.shape[0] @@ -496,8 +468,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -505,8 +476,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -518,20 +488,19 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - 
standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt * - num_images_per_prompt, - axis=0, ) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + [init_latents] * additional_image_per_prompt * num_images_per_prompt, + axis=0, + ) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: - init_latents = paddle.concat( - [init_latents] * num_images_per_prompt, axis=0) + init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) # add noise to latents using the timestep shape = init_latents.shape @@ -546,25 +515,25 @@ def prepare_latents(self, @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -639,7 +608,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -661,20 +631,19 @@ def __call__( do_classifier_free_guidance, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) source_prompt_embeds = self._encode_prompt( - source_prompt, num_images_per_prompt, do_classifier_free_guidance, - None) + source_prompt, num_images_per_prompt, do_classifier_free_guidance, None + ) # 4. Preprocess image image = preprocess(image) # 5. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables latents, clean_latents = self.prepare_latents( @@ -683,7 +652,8 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) source_latents = latents # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -691,17 +661,14 @@ def __call__( generator = extra_step_kwargs.pop("generator", None) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = paddle.concat([latents] * 2) source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input( - source_latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) # predict the noise residual concat_latent_model_input = paddle.stack( @@ -711,7 +678,8 @@ def __call__( source_latent_model_input[1], latent_model_input[1], ], - axis=0, ) + axis=0, + ) concat_prompt_embeds = paddle.stack( [ source_prompt_embeds[0], @@ -719,23 +687,25 @@ def __call__( source_prompt_embeds[1], prompt_embeds[1], ], - axis=0, ) + axis=0, + ) concat_noise_pred = self.unet( concat_latent_model_input, t, - encoder_hidden_states=concat_prompt_embeds, ).sample + encoder_hidden_states=concat_prompt_embeds, + ).sample # perform guidance ( source_noise_pred_uncond, noise_pred_uncond, source_noise_pred_text, - noise_pred_text, ) = concat_noise_pred.chunk( - 4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond) + source_noise_pred_text - source_noise_pred_uncond + ) # Sample source_latents from the posterior distribution. prev_source_latents = posterior_sample( @@ -744,7 +714,8 @@ def __call__( t, clean_latents, generator=generator, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) # Compute noise. 
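        # A sketch of what `compute_noise` recovers, using the DDIM notation of formula (12) in
        # https://arxiv.org/pdf/2010.02502.pdf: the update is
        #     x_t−1 = sqrt(α_t−1) * x̂_0 + sqrt(1 − α_t−1 − σ_t²) * ε_θ(x_t) + σ_t * z
        # so, given x_t (`source_latents`), x_t−1 (`prev_source_latents`) and ε_θ (`source_noise_pred`),
        # it solves for the noise term
        #     z = (x_t−1 − sqrt(α_t−1) * x̂_0 − sqrt(1 − α_t−1 − σ_t²) * ε_θ(x_t)) / σ_t
        # with σ_t = eta * sqrt(variance). This recovered z is then reused as `variance_noise` in the
        # `scheduler.step` call below, so the edited latents follow the same stochastic path as the
        # source latents.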
noise = compute_noise( self.scheduler, @@ -752,21 +723,17 @@ def __call__( source_latents, t, source_noise_pred, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) source_latents = prev_source_latents # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, - t, - latents, - variance_noise=noise, - **extra_step_kwargs).prev_sample + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -775,8 +742,7 @@ def __call__( image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 11. Convert to PIL if output_type == "pil": @@ -785,5 +751,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py index 33a4cd8838fe2..31fc2eb7d9db6 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py @@ -22,57 +22,52 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDIMScheduler from ...utils import logging, randn_tensor -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) -def posterior_sample(scheduler, latents, timestep, clean_latents, generator, - eta): +def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps if prev_timestep <= 0: return clean_latents # 2. 
compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # direction pointing to x_t - e_t = (latents - alpha_prod_t** - (0.5) * clean_latents) / (1 - alpha_prod_t)**(0.5) - dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2)**(0.5) * e_t - noise = std_dev_t * randn_tensor( - clean_latents.shape, dtype=clean_latents.dtype, generator=generator) - prev_latents = alpha_prod_t_prev**(0.5) * clean_latents + dir_xt + noise + e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) + dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t + noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) + prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise return prev_latents def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps # 2. compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t** - (0.5) * noise_pred) / alpha_prod_t**(0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) # 4. Clip "predicted x_0" if scheduler.config.clip_sample: @@ -81,21 +76,18 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 5. compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( - 0.5) * noise_pred + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred - noise = (prev_latents - - (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction)) / ( - variance**(0.5) * eta) + noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( + variance ** (0.5) * eta + ) return noise -class FastDeployCycleDiffusionPipeline(DiffusionPipeline, - FastDeployDiffusionPipelineMixin): +class FastDeployCycleDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. 
@@ -125,16 +117,17 @@ class FastDeployCycleDiffusionPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: DDIMScheduler, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: DDIMScheduler, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -159,37 +152,38 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() self.change_scheduler("ddim") def __call__( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -266,8 +260,7 @@ def __call__( (nsfw) content, according to the `safety_checker`. """ # 0. 
Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs @@ -279,7 +272,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -305,23 +299,23 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) source_prompt_embeds = self._encode_prompt( source_prompt, num_images_per_prompt, do_classifier_free_guidance, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 6. Prepare latent variables # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) is_strength_max = strength == 1.0 latents, clean_latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -333,7 +327,8 @@ def __call__( timestep=latent_timestep, is_strength_max=is_strength_max, return_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) source_latents = latents # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -341,18 +336,15 @@ def __call__( generator = extra_step_kwargs.pop("generator", None) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = paddle.concat([latents] * 2) source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input( - source_latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) # predict the noise residual concat_latent_model_input = paddle.stack( @@ -362,7 +354,8 @@ def __call__( source_latent_model_input[1], latent_model_input[1], ], - axis=0, ) + axis=0, + ) concat_prompt_embeds = paddle.stack( [ source_prompt_embeds[0], @@ -370,14 +363,16 @@ def __call__( source_prompt_embeds[1], prompt_embeds[1], ], - axis=0, ) + axis=0, + ) unet_inputs = dict( sample=concat_latent_model_input, timestep=t, encoder_hidden_states=concat_prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=concat_latent_model_input.shape, ) + output_shape=concat_latent_model_input.shape, + ) # predict the noise residual concat_noise_pred = self.unet(**unet_inputs)[0] @@ -386,12 +381,12 @@ def __call__( source_noise_pred_uncond, noise_pred_uncond, source_noise_pred_text, - noise_pred_text, ) = concat_noise_pred.chunk( - 4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond) + source_noise_pred_text - source_noise_pred_uncond + ) # Sample source_latents from the posterior distribution. prev_source_latents = posterior_sample( @@ -400,7 +395,8 @@ def __call__( t, clean_latents, generator=generator, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) # Compute noise. 
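                # As in `pipeline_cycle_diffusion.py` above, `compute_noise` inverts the DDIM update to
                # recover the per-step noise that reproduces `prev_source_latents`; that noise is reused
                # as `variance_noise` in the `scheduler.step` call below.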
noise = compute_noise( self.scheduler, @@ -408,20 +404,16 @@ def __call__( source_latents, t, source_noise_pred, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) source_latents = prev_source_latents # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, - t, - latents, - variance_noise=noise, - **extra_step_kwargs).prev_sample + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -432,7 +424,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -443,11 +436,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py index 61110d7638d0f..8de1b7b464dfb 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py @@ -22,15 +22,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionPipeline(DiffusionPipeline, - FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -60,21 +58,20 @@ class FastDeployStableDiffusionPipeline(DiffusionPipeline, feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" - _optional_components = [ - "vae_encoder", "safety_checker", "feature_extractor" - ] + _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -99,34 +96,35 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -200,7 +198,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. 
Define call parameters @@ -226,7 +225,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -238,7 +238,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -250,47 +251,42 @@ def __call__( height, width, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 @@ -301,15 +297,13 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is 
not None and i % callback_steps == 0: callback(i, t, latents) @@ -320,7 +314,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -331,11 +326,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py index 6d1b14edfaa32..324d66f3e0187 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py @@ -13,16 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .pipeline_fastdeploy_stable_diffusion import \ - FastDeployStableDiffusionPipeline +from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline -class FastDeployStableDiffusionControlNetPipeline( - FastDeployStableDiffusionPipeline): +class FastDeployStableDiffusionControlNetPipeline(FastDeployStableDiffusionPipeline): def __call__( - self, - *args, - **kwargs, ): + self, + *args, + **kwargs, + ): controlnet_cond = kwargs.pop("controlnet_cond", None) image = kwargs.pop("image", None) if controlnet_cond is None: diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py index 7f92020a9d9dc..b90541cfd23a1 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py @@ -20,7 +20,9 @@ from paddlenlp.transformers import CLIPImageProcessor from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel) + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, +) from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging @@ -30,8 +32,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionImageVariationPipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionImageVariationPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline to generate variations from an input image using Stable Diffusion. 
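A rough usage sketch for the image-variation pipeline follows. It is an assumption-laden example rather than code from this change: the import path and model directory are placeholders, and only parameters from the __call__ signature shown later in this file are used.

# Hypothetical example: generate variations of an input image.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionImageVariationPipeline  # assumed export path

pipe = FastDeployStableDiffusionImageVariationPipeline.from_pretrained(
    "./sd-image-variations-fastdeploy"  # placeholder model directory
)
init_image = Image.open("input.png").convert("RGB")
output = pipe(image=init_image, num_inference_steps=50, guidance_scale=7.5)
output.images[0].save("variation.png")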
@@ -59,15 +60,16 @@ class FastDeployStableDiffusionImageVariationPipeline( _optional_components = ["safety_checker"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - image_encoder: FastDeployRuntimeModel, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + image_encoder: FastDeployRuntimeModel, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -93,28 +95,27 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() - def _encode_image(self, image, num_images_per_prompt, - do_classifier_free_guidance, infer_op_dict): + def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict): if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image_encoder_inputs = dict( pixel_values=image, infer_op=infer_op_dict.get("image_encoder", None), - output_shape=[image.shape[0], 768], ) + output_shape=[image.shape[0], 768], + ) image_embeddings = self.image_encoder(**image_encoder_inputs)[0] image_embeddings = image_embeddings.unsqueeze(1) # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: negative_prompt_embeds = paddle.zeros_like(image_embeddings) @@ -122,49 +123,50 @@ def _encode_image(self, image, num_images_per_prompt, # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) @paddle.no_grad() def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -242,9 +244,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input image - image_embeddings = self._encode_image(image, num_images_per_prompt, - do_classifier_free_guidance, - infer_op_dict) + image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -256,26 +256,23 @@ def __call__( height, width, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual unet_inputs = dict( @@ -283,14 +280,14 @@ def __call__( timestep=t, encoder_hidden_states=image_embeddings, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) noise_pred = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 if is_scheduler_support_step_index: @@ -300,16 +297,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -320,7 +315,8 @@ def __call__( # 8. Post-processing image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) # 9. 
Run safety checker image, has_nsfw_concept = self.run_safety_checker(image) @@ -330,11 +326,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py index c282d47747dec..49f736a9c71c5 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py @@ -22,15 +22,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionImg2ImgPipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionImg2ImgPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image-to-image generation using Stable Diffusion. 
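As a point of reference, a text-guided image-to-image call might look like the sketch below; the model directory and file names are placeholders and the arguments mirror the defaults in this pipeline's __call__ signature, so treat it as an illustrative assumption rather than documented usage.

# Hypothetical img2img sketch; "./stable-diffusion-v1-5-fastdeploy" and
# "sketch.png" are placeholders.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionImg2ImgPipeline  # assumed export path

pipe = FastDeployStableDiffusionImg2ImgPipeline.from_pretrained("./stable-diffusion-v1-5-fastdeploy")
init_image = Image.open("sketch.png").convert("RGB")
image = pipe(
    prompt="a fantasy landscape, highly detailed",
    image=init_image,
    strength=0.8,            # how much of the denoising schedule to run
    num_inference_steps=50,
    guidance_scale=7.5,
).images[0]
image.save("fantasy_landscape.png")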
@@ -63,16 +61,17 @@ class FastDeployStableDiffusionImg2ImgPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -97,36 +96,37 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -199,8 +199,7 @@ def __call__( (nsfw) content, according to the `safety_checker`. """ # 0. 
Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs. Raise error if not correct @@ -212,7 +211,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -238,7 +238,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -250,17 +251,16 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare latent variables # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 latents = self.prepare_latents( @@ -272,47 +272,42 @@ def __call__( image=init_image, timestep=latent_timestep, is_strength_max=is_strength_max, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -324,16 +319,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -344,7 +337,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -355,11 +349,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git 
a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py index 4fdbacaaf890a..2ae694a4f8e2f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py @@ -23,18 +23,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def prepare_mask_and_masked_image(image, - mask, - height=None, - width=None, - return_image: bool=False): +def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): """ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the @@ -71,14 +66,11 @@ def prepare_mask_and_masked_image(image, if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -95,12 +87,9 @@ def prepare_mask_and_masked_image(image, else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -117,8 +106,7 @@ def prepare_mask_and_masked_image(image, # Image as float32 image = image.cast(dtype=paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -129,13 +117,8 @@ def prepare_mask_and_masked_image(image, w, h = image[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - image = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) - for i in image - ] + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] image = 
[np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -154,14 +137,9 @@ def prepare_mask_and_masked_image(image, w, h = mask[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - mask = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask - ] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -179,8 +157,7 @@ def prepare_mask_and_masked_image(image, return mask, masked_image -class FastDeployStableDiffusionInpaintPipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionInpaintPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image inpainting using Stable Diffusion. @@ -213,16 +190,17 @@ class FastDeployStableDiffusionInpaintPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -247,38 +225,39 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - 
infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -363,7 +342,8 @@ def __call__( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -375,7 +355,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -401,15 +382,14 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -429,7 +409,8 @@ def __call__( is_strength_max=is_strength_max, return_noise=True, return_image_latents=return_image_latents, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) if return_image_latents: latents, noise, image_latents = latents_outputs @@ -445,24 +426,23 @@ def __call__( width, do_classifier_free_guidance, return_masked_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 7. Check that sizes of mask, masked image and latents match if num_channels_unet == 9: # default case for runwayml/stable-diffusion-inpainting num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + - num_channels_masked_image != num_channels_unet): + if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: raise ValueError( f"Incorrect configuration settings! 
Received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) elif num_channels_unet != 4: - raise ValueError( - f"The unet should have either 4 or 9 input channels, not {num_channels_unet}." - ) + raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.") # do_controlnet do_controlnet = controlnet_cond is not None and num_channels_unet == 4 if do_controlnet: @@ -473,59 +453,52 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) output_shape = latent_model_input.shape if not is_legacy: # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = paddle.concat( - [latent_model_input, mask, masked_image_latents], - axis=1) + latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=output_shape, ) + output_shape=output_shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -537,32 +510,27 @@ def __call__( latents, step_index=i, 
return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample if is_legacy: if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -573,7 +541,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -584,11 +553,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py index 59c3a5bd12dec..7d2c1d82e5651 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py @@ -23,18 +23,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . 
import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def prepare_mask_and_masked_image(image, - mask, - height=None, - width=None, - return_image: bool=False): +def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): """ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the @@ -71,14 +66,11 @@ def prepare_mask_and_masked_image(image, if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -95,12 +87,9 @@ def prepare_mask_and_masked_image(image, else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -117,8 +106,7 @@ def prepare_mask_and_masked_image(image, # Image as float32 image = image.cast(dtype=paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -129,13 +117,8 @@ def prepare_mask_and_masked_image(image, w, h = image[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - image = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) - for i in image - ] + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] image = [np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -154,14 +137,9 @@ def prepare_mask_and_masked_image(image, w, h = mask[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - mask = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask - ] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif 
isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -179,8 +157,7 @@ def prepare_mask_and_masked_image(image, return mask, masked_image -class FastDeployStableDiffusionInpaintPipelineLegacy( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionInpaintPipelineLegacy(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image inpainting legacy using Stable Diffusion. @@ -213,16 +190,17 @@ class FastDeployStableDiffusionInpaintPipelineLegacy( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -247,38 +225,39 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + 
negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -363,7 +342,8 @@ def __call__( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -375,7 +355,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -400,7 +381,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -412,15 +394,14 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -436,7 +417,8 @@ def __call__( is_strength_max=is_strength_max, return_noise=True, return_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 6. Prepare mask latent variables mask = self.prepare_mask_latents( @@ -447,52 +429,47 @@ def __call__( width, do_classifier_free_guidance, return_masked_image_latents=False, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -504,32 +481,27 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -540,7 +512,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - 
infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -551,11 +524,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py index 7f66d4caec169..d2c9622fd7c8a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py @@ -19,16 +19,17 @@ import PIL.Image from ...utils import logging -from .pipeline_fastdeploy_cycle_diffusion import \ - FastDeployCycleDiffusionPipeline -from .pipeline_fastdeploy_stable_diffusion import \ - FastDeployStableDiffusionPipeline -from .pipeline_fastdeploy_stable_diffusion_img2img import \ - FastDeployStableDiffusionImg2ImgPipeline -from .pipeline_fastdeploy_stable_diffusion_inpaint import \ - FastDeployStableDiffusionInpaintPipeline -from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import \ - FastDeployStableDiffusionInpaintPipelineLegacy +from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline +from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline +from .pipeline_fastdeploy_stable_diffusion_img2img import ( + FastDeployStableDiffusionImg2ImgPipeline, +) +from .pipeline_fastdeploy_stable_diffusion_inpaint import ( + FastDeployStableDiffusionInpaintPipeline, +) +from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import ( + FastDeployStableDiffusionInpaintPipelineLegacy, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -63,45 +64,39 @@ class FastDeployStableDiffusionMegaPipeline(FastDeployStableDiffusionPipeline): feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" - _optional_components = [ - "vae_encoder", "safety_checker", "feature_extractor" - ] + _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] def __call__(self, *args, **kwargs): return self.text2img(*args, **kwargs) def text2img( - self, - prompt: Union[str, List[str]], - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): - expected_components = inspect.signature( - FastDeployStableDiffusionPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(FastDeployStableDiffusionPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -122,42 +117,39 @@ def text2img( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, 
paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): - expected_components = inspect.signature( - FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): + expected_components = inspect.signature(FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionImg2ImgPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -180,48 +172,46 @@ def img2img( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def inpaint_legacy( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: 
Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): assert ( self.unet_num_latent_channels == 4 ), f"Detected `unet_num_latent_channels` is {self.unet_num_latent_channels}, Plese use `inpaint` method." expected_components = inspect.signature( - FastDeployStableDiffusionInpaintPipelineLegacy. - __init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + FastDeployStableDiffusionInpaintPipelineLegacy.__init__ + ).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionInpaintPipelineLegacy( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -245,45 +235,42 @@ def inpaint_legacy( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height=None, - width=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height=None, + width=None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): assert self.unet_num_latent_channels in [4, 9] - expected_components 
= inspect.signature( - FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionInpaintPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -307,46 +294,42 @@ def inpaint( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def cycle_diffusion( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): - expected_components = inspect.signature( - FastDeployCycleDiffusionPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): + expected_components = inspect.signature(FastDeployCycleDiffusionPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployCycleDiffusionPipeline( - **components, - 
requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -371,6 +354,7 @@ def cycle_diffusion( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py index 05ff6fa970504..db0660a1cbb90 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py @@ -21,24 +21,23 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDPMScheduler from ...utils import logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from ..pipeline_utils import ImagePipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionUpscalePipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionUpscalePipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): def __init__( - self, - vae: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: Any, - unet: FastDeployRuntimeModel, - low_res_scheduler: DDPMScheduler, - scheduler: Any, - max_noise_level: int=350, ): + self, + vae: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: Any, + unet: FastDeployRuntimeModel, + low_res_scheduler: DDPMScheduler, + scheduler: Any, + max_noise_level: int = 350, + ): super().__init__( vae=vae, text_encoder=text_encoder, @@ -49,18 +48,19 @@ def __init__( safety_checker=None, feature_extractor=None, watermarker=None, - max_noise_level=max_noise_level, ) + max_noise_level=max_noise_level, + ) self.post_init(vae_scaling_factor=0.08333) def check_inputs(self, prompt, image, noise_level, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" ) @@ -83,39 +83,38 @@ def check_inputs(self, prompt, image, noise_level, callback_steps): # check noise level if noise_level > self.config.max_noise_level: - raise ValueError( - f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}" - ) + raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise 
ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], - num_inference_steps: int=75, - guidance_scale: float=9.0, - noise_level: int=20, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[np.ndarray]=None, - negative_prompt_embeds: Optional[np.ndarray]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + noise_level: int = 20, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -204,7 +203,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Preprocess image image = self.image_processor.preprocess(image) @@ -215,13 +215,11 @@ def __call__( # 5. Add noise to image noise_level = paddle.to_tensor([noise_level], dtype="int64") - noise = paddle.randn( - image.shape, generator=generator, dtype=text_embeddings.dtype) + noise = paddle.randn(image.shape, generator=generator, dtype=text_embeddings.dtype) image = self.low_res_scheduler.add_noise(image, noise, noise_level) batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = paddle.concat([image] * batch_multiplier * - num_images_per_prompt) + image = paddle.concat([image] * batch_multiplier * num_images_per_prompt) noise_level = paddle.concat([noise_level] * image.shape[0]) # 6. Prepare latent variables @@ -231,7 +229,8 @@ def __call__( height, width, generator, - latents, ) + latents, + ) NUM_UNET_INPUT_CHANNELS = self.unet_num_latent_channels NUM_LATENT_CHANNELS = self.vae_decoder_num_latent_channels @@ -243,27 +242,24 @@ def __call__( f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +" f" `num_channels_image`: {num_channels_image} " f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 8. Prepare extra step kwargs. 
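Note: the channel check above guards the upscaler's conditioning scheme: the noisy low-resolution image is concatenated with the latents along the channel axis, so the unet's input channels must equal latent channels plus image channels. A small numpy illustration; the channel counts are assumed examples, not read from any real config:

```python
import numpy as np

# Channel bookkeeping for the upscaler unet input described above.
num_latent_channels = 4   # assumed VAE latent channels
num_image_channels = 3    # assumed RGB low-res image channels
unet_in_channels = 7      # what the upscaler unet is assumed to expect

latents = np.zeros((2, num_latent_channels, 32, 32), dtype="float32")
noisy_low_res_image = np.zeros((2, num_image_channels, 32, 32), dtype="float32")

sample = np.concatenate([latents, noisy_low_res_image], axis=1)  # channel-wise concat
assert sample.shape[1] == unet_in_channels, (
    f"unet expects {unet_in_channels} input channels, "
    f"got {num_latent_channels} + {num_image_channels} = {sample.shape[1]}"
)
print(sample.shape)  # (2, 7, 32, 32)
```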
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=paddle.concat( @@ -272,16 +268,15 @@ def __call__( timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -293,16 +288,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -313,16 +306,18 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) else: image = latents do_denormalize = [True] * image.shape[0] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: - return (image, ) + return (image,) - return ImagePipelineOutput(images=image, ) + return ImagePipelineOutput( + images=image, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 75f8db28f0c67..b847facb71074 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py 
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -18,16 +18,13 @@ import paddle from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -48,8 +45,7 @@ """ -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -87,37 +83,33 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -125,11 +117,7 @@ def __init__( " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -150,12 +138,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -166,12 +152,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -183,18 +166,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -234,29 +219,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -264,8 +251,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -275,21 +261,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -297,46 +284,42 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
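Note: the classifier-free guidance comments here describe the batching trick used in every denoising loop in this diff: as the comment continues just below, the unconditional and text embeddings are stacked into one batch, the latents are duplicated, the unet runs once on the doubled batch, and the two halves are recombined with `guidance_scale`. A minimal numpy stand-in; `fake_unet`, the shapes, and the scale are illustrative only:

```python
import numpy as np

def fake_unet(sample, prompt_embeds):
    # placeholder for the real unet's noise prediction
    return sample * 0.1

guidance_scale = 7.5
latents = np.random.default_rng(0).standard_normal((1, 4, 64, 64)).astype("float32")
prompt_embeds = np.zeros((2, 77, 768), dtype="float32")  # [uncond; text] stacked

latent_model_input = np.concatenate([latents] * 2, axis=0)   # duplicate latents
noise_pred = fake_unet(latent_model_input, prompt_embeds)    # single forward pass
noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2, axis=0)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)  # (1, 4, 64, 64)
```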
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -355,53 +338,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
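Note: `prepare_extra_step_kwargs` in this file probes the scheduler's `step` signature so that `eta` (the DDIM η) and `generator` are only forwarded to schedulers that accept them. A self-contained sketch of that probing, with a dummy scheduler standing in for the real ones:

```python
import inspect

class DummyScheduler:
    def step(self, noise_pred, t, latents, eta=0.0):  # accepts eta, not generator
        return latents

def build_extra_step_kwargs(scheduler, generator, eta):
    # pass eta/generator only if the scheduler's step() signature declares them
    params = set(inspect.signature(scheduler.step).parameters.keys())
    extra = {}
    if "eta" in params:
        extra["eta"] = eta
    if "generator" in params:
        extra["generator"] = generator
    return extra

print(build_extra_step_kwargs(DummyScheduler(), generator=None, eta=0.0))  # {'eta': 0.0}
```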
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -414,17 +393,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -447,25 +428,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -546,7 +527,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -568,7 +550,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -583,43 +566,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -632,8 +610,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -642,11 +619,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py index 7b6cf35b03da0..0ec8990c31e59 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py @@ -18,15 +18,12 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer # from ...loaders import TextualInversionLoaderMixin -from ...models import (AutoencoderKL, MultiAdapter, T2IAdapter, - UNet2DConditionModel) +from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, logging, randn_tensor, - replace_example_docstring) +from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -81,8 +78,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -95,12 +91,8 @@ def preprocess(image): if isinstance(image[0], PIL.Image.Image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image - ] - image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) - for i in image] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image] + image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -155,17 +147,18 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - adapter_weights: Optional[List[float]]=None, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + adapter_weights: 
Optional[List[float]] = None, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -185,8 +178,9 @@ def __init__( adapter=adapter, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): @@ -206,13 +200,14 @@ def disable_vae_slicing(self): self.vae.disable_slicing() def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): """ Encodes the prompt into text encoder hidden states. @@ -249,32 +244,29 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None - prompt_embeds = self.text_encoder( - text_input_ids, attention_mask=attention_mask) + prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.astype(self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile( - repeat_times=[1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: @@ -300,34 +292,28 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + return_tensors="pd", + ) + if hasattr(self.text_encoder.config, 
"use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.astype( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - repeat_times=[1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - (batch_size * num_images_per_prompt, seq_len, -1)) - prompt_embeds = paddle.concat( - x=[negative_prompt_embeds, prompt_embeds]) + negative_prompt_embeds = negative_prompt_embeds.astype(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1)) + prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.astype(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.astype(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -336,37 +322,36 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype( - dtype="float32").numpy() + image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype(dtype="float32").numpy() return image def prepare_extra_step_kwargs(self, generator, eta): - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) @@ -378,11 +363,8 @@ def check_inputs( raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." @@ -394,19 +376,21 @@ def check_inputs( ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
@@ -438,28 +422,27 @@ def _default_height_width(self, height, width, image): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, List[ - PIL.Image.Image]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - adapter_conditioning_scale: Union[float, List[float]]=1.0, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + adapter_conditioning_scale: Union[float, List[float]] = 1.0, + ): """ Function invoked when calling the pipeline for generation. 
@@ -550,13 +533,13 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) is_multi_adapter = isinstance(self.adapter, MultiAdapter) if is_multi_adapter: adapter_input = [preprocess(img) for img in image] n, c, h, w = adapter_input[0].shape - adapter_input = paddle.stack( - x=[x.reshape([n * c, h, w]) for x in adapter_input]) + adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input]) else: adapter_input = preprocess(image) adapter_input = adapter_input.astype(self.adapter.dtype) @@ -573,7 +556,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps num_channels_latents = self.unet.in_channels @@ -584,43 +568,35 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) adapter_state = self.adapter(adapter_input) for k, v in enumerate(adapter_state): adapter_state[k] = v * adapter_conditioning_scale if num_images_per_prompt > 1: for k, v in enumerate(adapter_state): - adapter_state[k] = v.tile( - repeat_times=[num_images_per_prompt, 1, 1, 1]) + adapter_state[k] = v.tile(repeat_times=[num_images_per_prompt, 1, 1, 1]) if do_classifier_free_guidance: for k, v in enumerate(adapter_state): adapter_state[k] = paddle.concat(x=[v] * 2, axis=0) - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - latent_model_input = (paddle.concat(x=[latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=[ - state.clone() for state in adapter_state - ], ).sample + down_block_additional_residuals=[state.clone() for state in adapter_state], + ).sample if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample - if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -629,14 +605,11 @@ def __call__( has_nsfw_concept = None elif output_type == "pil": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, 
has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) image = self.numpy_to_pil(image) else: image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return image, has_nsfw_concept - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py index 3deff63114cd2..3971ea99471d6 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py @@ -25,17 +25,20 @@ import PIL import PIL.Image from packaging import version -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...pipeline_utils import DiffusionPipeline from ...schedulers import ( - DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler) + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) from ...utils import PIL_INTERPOLATION, deprecate, logging from ...utils.testing_utils import load_image from . import StableDiffusionPipelineOutput @@ -86,7 +89,8 @@ def save_all(images, FORMAT="jpg", OUTDIR="./outputs/"): [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -175,9 +179,7 @@ def multiply_range(start_position, multiplier): return res -def get_prompts_with_weights(pipe: DiffusionPipeline, - prompt: List[str], - max_length: int): +def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int): r""" Tokenize a list of prompts and return its tokens with weights of each token. @@ -212,32 +214,20 @@ def get_prompts_with_weights(pipe: DiffusionPipeline, tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. 
""" max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -245,8 +235,7 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] @@ -255,10 +244,11 @@ def pad_tokens_and_weights(tokens, def get_unweighted_text_embeddings( - pipe: DiffusionPipeline, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, ): + pipe: DiffusionPipeline, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. @@ -268,8 +258,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -296,14 +285,15 @@ def get_unweighted_text_embeddings( def get_weighted_text_embeddings( - pipe: DiffusionPipeline, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + pipe: DiffusionPipeline, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -329,24 +319,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. 
""" - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -354,33 +339,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( prompt_tokens, @@ -390,7 +368,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( @@ -401,7 +380,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) # get the embeddings @@ -409,30 +389,28 @@ def get_weighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, 
- no_boseos_middle=no_boseos_middle, ) - prompt_weights = paddle.to_tensor( - prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + prompt_weights = paddle.to_tensor(prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - uncond_weights = paddle.to_tensor( - uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + uncond_weights = paddle.to_tensor(uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= previous_mean / text_embeddings.mean( - axis=[-2, -1], keepdim=True) + text_embeddings *= previous_mean / text_embeddings.mean(axis=[-2, -1], keepdim=True) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= previous_mean / uncond_embeddings.mean( - axis=[-2, -1], keepdim=True) + uncond_embeddings *= previous_mean / uncond_embeddings.mean(axis=[-2, -1], keepdim=True) if uncond_prompt is not None: return text_embeddings, uncond_embeddings @@ -453,9 +431,7 @@ def preprocess_mask(mask, scale_factor=8): mask = mask.convert("L") w, h = mask.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize( - (w // scale_factor, h // scale_factor), - resample=PIL_INTERPOLATION["nearest"]) + mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? @@ -464,9 +440,7 @@ def preprocess_mask(mask, scale_factor=8): return mask -class StableDiffusionPipelineAllinOne(DiffusionPipeline, - TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionPipelineAllinOne(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image image-to-image inpainting generation using Stable Diffusion. 
@@ -497,38 +471,38 @@ class StableDiffusionPipelineAllinOne(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, ], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=False, ): - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = False, + ): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -536,11 +510,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -559,12 +529,10 @@ def __init__( f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -575,12 +543,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -592,7 +557,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.__init__additional__() @@ -602,7 +568,8 @@ def __init__additional__(self): setattr( self, "vae_scale_factor", - 2**(len(self.vae.config.block_out_channels) - 1), ) + 2 ** (len(self.vae.config.block_out_channels) - 1), + ) def __call__(self, *args, **kwargs): return self.text2image(*args, **kwargs) @@ -611,16 +578,17 @@ def text2img(self, *args, **kwargs): return self.text2image(*args, **kwargs) def _encode_prompt( - self, - prompt, - negative_prompt, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - do_classifier_free_guidance, - num_images_per_prompt, - **kwargs, ): + self, + prompt, + negative_prompt, + max_embeddings_multiples, + no_boseos_middle, + skip_parsing, + skip_weighting, + do_classifier_free_guidance, + num_images_per_prompt, + **kwargs, + ): batch_size = len(prompt) if isinstance(prompt, list) else 1 if negative_prompt is None: @@ -631,41 +599,37 @@ def _encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) text_embeddings, uncond_embeddings = get_weighted_text_embeddings( pipe=self, prompt=prompt, - uncond_prompt=negative_prompt - if do_classifier_free_guidance else None, + uncond_prompt=negative_prompt if do_classifier_free_guidance else None, max_embeddings_multiples=max_embeddings_multiples, no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - **kwargs, ) + **kwargs, + ) bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [1, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) return text_embeddings def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -684,8 +648,7 @@ def prepare_extra_step_kwargs(self, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta @@ -694,61 +657,47 @@ def prepare_extra_step_kwargs(self, eta): def check_inputs_text2img(self, prompt, height, width, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) def check_inputs_img2img_inpaint(self, prompt, strength, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [1.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") - - def prepare_latents_text2img(self, - batch_size, - num_channels_latents, - height, - width, - dtype, - latents=None): + f" {type(callback_steps)}." + ) + + def prepare_latents_text2img(self, batch_size, num_channels_latents, height, width, dtype, latents=None): shape = [batch_size, num_channels_latents, height // 8, width // 8] if latents is None: latents = paddle.randn(shape, dtype=dtype) else: if latents.shape != shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma return latents - def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, - dtype): + def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, dtype): image = image.cast(dtype=dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample() @@ -756,8 +705,7 @@ def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, b, c, h, w = init_latents.shape init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1]) - init_latents = init_latents.reshape( - [b * num_images_per_prompt, c, h, w]) + init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w]) # add noise to latents using the timesteps noise = paddle.randn(init_latents.shape, dtype=dtype) @@ -779,8 +727,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start - def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, - dtype): + def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, dtype): image = image.cast(dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample() @@ -788,8 +735,7 @@ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, b, c, h, w = init_latents.shape init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1]) - init_latents = init_latents.reshape( - [b * num_images_per_prompt, c, h, w]) + init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w]) init_latents_orig = init_latents @@ -801,27 +747,28 @@ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, @paddle.no_grad() def text2image( - self, - prompt: Union[str, List[str]], - height: int=512, - width: int=512, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - 
eta: float=0.0, - seed: Optional[int]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - # new add - max_embeddings_multiples: Optional[int]=3, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + seed: Optional[int] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + # new add + max_embeddings_multiples: Optional[int] = 3, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -891,7 +838,8 @@ def text2image( no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - epoch_time=time.time(), ) + epoch_time=time.time(), + ) paddle.seed(seed) # 1. Check inputs. Raise error if not correct self.check_inputs_text2img(prompt, height, width, callback_steps) @@ -912,7 +860,8 @@ def text2image( skip_parsing, skip_weighting, do_classifier_free_guidance, - num_images_per_prompt, ) + num_images_per_prompt, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -926,42 +875,33 @@ def text2image( height, width, text_embeddings.dtype, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -970,8 +910,7 @@ def text2image( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 10. 
Convert to PIL if output_type == "pil": @@ -980,33 +919,33 @@ def text2image( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - strength: float=0.8, - height=None, - width=None, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - # new add - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + strength: float = 0.8, + height=None, + width=None, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + # new add + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -1093,7 +1032,8 @@ def img2img( no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - epoch_time=time.time(), ) + epoch_time=time.time(), + ) paddle.seed(seed) # 1. Check inputs @@ -1115,7 +1055,8 @@ def img2img( skip_parsing, skip_weighting, do_classifier_free_guidance, - num_images_per_prompt, ) + num_images_per_prompt, + ) # 4. Preprocess image if isinstance(image, PIL.Image.Image): @@ -1124,50 +1065,36 @@ def img2img( # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables - latents = self.prepare_latents_img2img(image, latent_timestep, - num_images_per_prompt, - text_embeddings.dtype) + latents = self.prepare_latents_img2img(image, latent_timestep, num_images_per_prompt, text_embeddings.dtype) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1176,8 +1103,7 @@ def img2img( image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 11. 
Convert to PIL if output_type == "pil": @@ -1186,34 +1112,34 @@ def img2img( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height=None, - width=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - # new add - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height=None, + width=None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + # new add + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -1309,7 +1235,8 @@ def inpaint( no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - epoch_time=time.time(), ) + epoch_time=time.time(), + ) paddle.seed(seed) # 1. Check inputs @@ -1331,7 +1258,8 @@ def inpaint( skip_parsing, skip_weighting, do_classifier_free_guidance, - num_images_per_prompt, ) + num_images_per_prompt, + ) if not isinstance(image, paddle.Tensor): image = image.resize((width, height)) @@ -1343,16 +1271,14 @@ def inpaint( # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables # encode the init image into latents and scale the latents latents, init_latents_orig, noise = self.prepare_latents_inpaint( - image, latent_timestep, num_images_per_prompt, - text_embeddings.dtype) + image, latent_timestep, num_images_per_prompt, text_embeddings.dtype + ) # 7. Prepare mask latent mask = mask_image.cast(latents.dtype) @@ -1362,41 +1288,30 @@ def inpaint( extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 9. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # masking - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise, t) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) latents = (init_latents_proper * mask) + (latents * (1 - mask)) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1405,8 +1320,7 @@ def inpaint( image = self.decode_latents(latents) # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 12. 
Convert to PIL if output_type == "pil": @@ -1415,8 +1329,7 @@ def inpaint( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @staticmethod def numpy_to_pil(images, **kwargs): diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 25099d6d6c726..4e5e08168878d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -21,8 +21,7 @@ import paddle import paddle.nn as nn from paddle.nn import functional as F -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -100,8 +99,7 @@ def aggregate_attention(self, from_where: List[str]) -> paddle.Tensor: attention_maps = self.get_average_attention() for location in from_where: for item in attention_maps[location]: - cross_maps = item.reshape( - [-1, self.attn_res[0], self.attn_res[1], item.shape[-1]]) + cross_maps = item.reshape([-1, self.attn_res[0], self.attn_res[1], item.shape[-1]]) out.append(cross_maps) out = paddle.concat(out, axis=0) out = out.sum(0) / out.shape[0] @@ -132,21 +130,19 @@ def __init__(self, attnstore, place_in_unet): self.place_in_unet = place_in_unet def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) is_cross = encoder_hidden_states is not None - encoder_hidden_states = (encoder_hidden_states - if encoder_hidden_states is not None else - hidden_states) + encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -160,8 +156,7 @@ def __call__( if not attention_probs.stop_gradient: # TODO must flatten (0, 1) # [bs, num_heads, q_len, k_len] -> [bs*num_heads, q_len, k_len] - self.attnstore( - attention_probs.flatten(0, 1), is_cross, self.place_in_unet) + self.attnstore(attention_probs.flatten(0, 1), is_cross, self.place_in_unet) hidden_states = paddle.matmul(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) @@ -174,8 +169,7 @@ def __call__( return hidden_states -class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion and Attend and Excite. 
@@ -205,15 +199,16 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -239,19 +234,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -291,29 +288,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -321,8 +320,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -332,21 +330,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -354,47 +353,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -415,54 +410,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - indices, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + indices, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -475,22 +466,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - indices_is_list_ints = isinstance(indices, list) and isinstance( - indices[0], int) - indices_is_list_list_ints = (isinstance(indices, list) and - isinstance(indices[0], list) and - isinstance(indices[0][0], int)) + indices_is_list_ints = isinstance(indices, list) and isinstance(indices[0], int) + indices_is_list_list_ints = ( + isinstance(indices, list) and isinstance(indices[0], list) and isinstance(indices[0][0], int) + ) if not indices_is_list_ints and not indices_is_list_list_ints: - raise TypeError( - "`indices` must be a list of ints or a list of a list of ints") + raise TypeError("`indices` must be a list of ints or a list of a list of ints") - if (indices is None) or (indices is not None and - not isinstance(indices, List)): - raise ValueError( - f"`indices` has to be a list but is {type(indices)}") + if (indices is None) or (indices is not None and not isinstance(indices, List)): + raise ValueError(f"`indices` has to be a list but is {type(indices)}") if indices_is_list_ints: indices_batch_size = 1 @@ -511,19 +499,21 @@ def check_inputs( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -539,8 +529,9 @@ def prepare_latents( @staticmethod def _compute_max_attention_per_index( - attention_maps: paddle.Tensor, - indices: List[int], ) -> List[paddle.Tensor]: + attention_maps: paddle.Tensor, + indices: List[int], + ) -> List[paddle.Tensor]: """Computes the maximum attention value for each of the tokens we wish to alter.""" attention_for_text = attention_maps[:, :, 1:-1] attention_for_text *= 100 @@ -554,38 +545,35 @@ def _compute_max_attention_per_index( for i in indices: image = attention_for_text[:, :, i] smoothing = GaussianSmoothing() - input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), - mode="reflect") + input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect") image = smoothing(input).squeeze(0).squeeze(0) # paddle.max donot support float16 max_indices_list.append(image.max()) return max_indices_list def _aggregate_and_get_max_attention_per_token( - self, - indices: List[int], ): + self, + indices: List[int], + ): """Aggregates the attention for each token 
and computes the max activation value for each token to alter.""" attention_maps = self.attention_store.aggregate_attention( - from_where=("up", "down", "mid"), ) + from_where=("up", "down", "mid"), + ) max_attention_per_index = self._compute_max_attention_per_index( attention_maps=attention_maps, - indices=indices, ) + indices=indices, + ) return max_attention_per_index @staticmethod - def _compute_loss( - max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor: + def _compute_loss(max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor: """Computes the attend-and-excite loss using the maximum attention value for each token.""" - losses = [ - max(0, 1.0 - curr_max) for curr_max in max_attention_per_index - ] + losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index] loss = max(losses) return loss @staticmethod - def _update_latent(latents: paddle.Tensor, - loss: paddle.Tensor, - step_size: float) -> paddle.Tensor: + def _update_latent(latents: paddle.Tensor, loss: paddle.Tensor, step_size: float) -> paddle.Tensor: """Update the latent according to the computed loss.""" loss.stop_gradient = False grad_cond = paddle.autograd.grad(loss, [latents], retain_graph=True)[0] @@ -593,15 +581,16 @@ def _update_latent(latents: paddle.Tensor, return latents def _perform_iterative_refinement_step( - self, - latents: paddle.Tensor, - indices: List[int], - loss: paddle.Tensor, - threshold: float, - text_embeddings: paddle.Tensor, - step_size: float, - t: int, - max_refinement_steps: int=20, ): + self, + latents: paddle.Tensor, + indices: List[int], + loss: paddle.Tensor, + threshold: float, + text_embeddings: paddle.Tensor, + step_size: float, + t: int, + max_refinement_steps: int = 20, + ): """ Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent code according to our loss objective until the given threshold is reached for all tokens. @@ -618,7 +607,8 @@ def _perform_iterative_refinement_step( # Get max activation value for each subject token max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=indices, ) + indices=indices, + ) loss = self._compute_loss(max_attention_per_index) @@ -628,9 +618,7 @@ def _perform_iterative_refinement_step( logger.info(f"\t Try {iteration}. loss: {loss}") if iteration >= max_refinement_steps: - logger.info( - f"\t Exceeded max number of iterations ({max_refinement_steps})! " - ) + logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ") break # Run one more time but don't compute gradients and update the latents. 
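The reformatted `_compute_loss` and `_update_latent` above carry the core Attend-and-Excite rule: the loss is the largest `1 - a_i` over the per-token maximum attention values `a_i`, and the latents take one gradient step against that loss, scaled by a `step_size` that `__call__` later anneals via `scale_factor * np.sqrt(scale_range)`. A minimal self-contained sketch of the same arithmetic, using a toy differentiable stand-in for the aggregated attention so it runs without a UNet:

    import paddle

    # Toy latents; in the pipeline these are the per-image denoising latents.
    latents = paddle.randn([1, 4, 16, 16])
    latents.stop_gradient = False

    # Stand-in for _aggregate_and_get_max_attention_per_token: fake each subject
    # token's "max attention" as a channel mean so the loss depends on `latents`.
    max_attention_per_index = [latents[:, 0].mean(), latents[:, 1].mean()]

    # Same hinge objective as _compute_loss: penalise the least-attended token.
    losses = [paddle.clip(1.0 - a, min=0.0) for a in max_attention_per_index]
    loss = paddle.stack(losses).max()

    # Same update as _update_latent: one gradient step on the latents.
    step_size = 20.0  # assumed value; the pipeline anneals it per timestep
    grad_cond = paddle.autograd.grad(loss, [latents], retain_graph=True)[0]
    latents = latents - step_size * grad_cond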
@@ -643,7 +631,8 @@ def _perform_iterative_refinement_step( # Get max activation value for each subject token max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=indices, ) + indices=indices, + ) loss = self._compute_loss(max_attention_per_index) logger.info(f"\t Finished with loss of: {loss}") return loss, latents, max_attention_per_index @@ -662,8 +651,7 @@ def register_attention_control(self): continue cross_att_count += 1 - attn_procs[name] = AttendExciteAttnProcessor( - attnstore=self.attention_store, place_in_unet=place_in_unet) + attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet) self.unet.set_attn_processor(attn_procs) self.attention_store.num_att_layers = cross_att_count @@ -671,42 +659,36 @@ def register_attention_control(self): def get_indices(self, prompt: str) -> Dict[str, int]: """Utility function to list the indices of the tokens you wish to alte""" ids = self.tokenizer(prompt).input_ids - indices = { - i: tok - for tok, i in zip( - self.tokenizer.convert_ids_to_tokens(ids), range(len(ids))) - } + indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))} return indices @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]], - token_indices: Union[List[int], List[List[int]]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: int=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - max_iter_to_alter: int=25, - thresholds: dict={0: 0.05, - 10: 0.5, - 20: 0.8}, - scale_factor: int=20, - attn_res: Optional[Tuple[int]]=(16, 16), ): + self, + prompt: Union[str, List[str]], + token_indices: Union[List[int], List[List[int]]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + max_iter_to_alter: int = 25, + thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, + scale_factor: int = 20, + attn_res: Optional[Tuple[int]] = (16, 16), + ): r""" Function invoked when calling the pipeline for generation. @@ -802,7 +784,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -824,7 +807,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -839,7 +823,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -853,8 +838,9 @@ def __call__( scale_range = np.linspace(1.0, 0.5, len(self.scheduler.timesteps)) step_size = scale_factor * np.sqrt(scale_range) - text_embeddings = (prompt_embeds[batch_size * num_images_per_prompt:] - if do_classifier_free_guidance else prompt_embeds) + text_embeddings = ( + prompt_embeds[batch_size * num_images_per_prompt :] if do_classifier_free_guidance else prompt_embeds + ) if isinstance(token_indices[0], int): token_indices = [token_indices] @@ -865,8 +851,7 @@ def __call__( indices = indices + [ind] * num_images_per_prompt # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # Attend and excite process @@ -874,8 +859,7 @@ def __call__( latents = latents.clone().detach() latents.stop_gradient = False updated_latents = [] - for latent, index, text_embedding in zip(latents, indices, - text_embeddings): + for latent, index, text_embedding in zip(latents, indices, text_embeddings): # Forward pass of denoising with text conditioning latent = latent.unsqueeze(0) text_embedding = text_embedding.unsqueeze(0) @@ -889,28 +873,23 @@ def __call__( self.unet.clear_gradients() # Get max activation value for each subject token - max_attention_per_index = ( - self._aggregate_and_get_max_attention_per_token( - indices=index, )) + max_attention_per_index = self._aggregate_and_get_max_attention_per_token( + indices=index, + ) - loss = self._compute_loss( - max_attention_per_index=max_attention_per_index) + loss = self._compute_loss(max_attention_per_index=max_attention_per_index) # If this is an iterative refinement step, verify we have reached the desired threshold for all - if i in thresholds.keys() and loss > 1.0 - thresholds[ - i]: - ( - loss, - latent, - max_attention_per_index, - ) = self._perform_iterative_refinement_step( + if i in thresholds.keys() and loss > 1.0 - thresholds[i]: + (loss, latent, max_attention_per_index,) = self._perform_iterative_refinement_step( latents=latent, indices=index, loss=loss, threshold=thresholds[i], text_embeddings=text_embedding, step_size=step_size[i], - t=t, ) + t=t, + ) # Perform gradient update if i < max_iter_to_alter: @@ -918,41 +897,36 @@ def __call__( latent = self._update_latent( latents=latent, loss=loss, - step_size=step_size[i], ) - logger.info( - f"Iteration {i} | Loss: {loss.item():0.4f}") + step_size=step_size[i], + ) + logger.info(f"Iteration {i} | Loss: {loss.item():0.4f}") updated_latents.append(latent) latents = paddle.concat(updated_latents, axis=0) # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - 
latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -961,8 +935,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL if output_type == "pil": @@ -971,8 +944,7 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) class GaussianSmoothing(nn.Layer): @@ -989,11 +961,12 @@ class GaussianSmoothing(nn.Layer): # channels=1, kernel_size=kernel_size, sigma=sigma, dim=2 def __init__( - self, - channels: int=1, - kernel_size: int=3, - sigma: float=0.5, - dim: int=2, ): + self, + channels: int = 1, + kernel_size: int = 3, + sigma: float = 0.5, + dim: int = 2, + ): super().__init__() if isinstance(kernel_size, int): @@ -1004,21 +977,17 @@ def __init__( # The gaussian kernel is the product of the # gaussian function of each dimension. kernel = 1 - meshgrids = paddle.meshgrid([ - paddle.arange( - size, dtype=paddle.float32) for size in kernel_size - ]) + meshgrids = paddle.meshgrid([paddle.arange(size, dtype=paddle.float32) for size in kernel_size]) for size, std, mgrid in zip(kernel_size, sigma, meshgrids): mean = (size - 1) / 2 - kernel *= (1 / (std * math.sqrt(2 * math.pi)) * - paddle.exp(-(((mgrid - mean) / (2 * std))**2))) + kernel *= 1 / (std * math.sqrt(2 * math.pi)) * paddle.exp(-(((mgrid - mean) / (2 * std)) ** 2)) # Make sure sum of values in gaussian kernel equals 1. kernel = kernel / paddle.sum(kernel) # Reshape to depthwise convolutional weight kernel = kernel.reshape([1, 1, *kernel.shape]) - kernel = kernel.tile([channels, * [1] * (kernel.ndim - 1)]) + kernel = kernel.tile([channels, *[1] * (kernel.ndim - 1)]) self.register_buffer("weight", kernel) self.groups = channels @@ -1030,9 +999,7 @@ def __init__( elif dim == 3: self.conv = F.conv3d else: - raise RuntimeError( - "Only 1, 2 and 3 dimensions are supported. Received {}.".format( - dim)) + raise RuntimeError("Only 1, 2 and 3 dimensions are supported. 
Received {}.".format(dim)) def forward(self, input): """ @@ -1042,5 +1009,4 @@ def forward(self, input): Returns: filtered (paddle.Tensor): Filtered output. """ - return self.conv( - input, weight=self.weight.cast(input.dtype), groups=self.groups) + return self.conv(input, weight=self.weight.cast(input.dtype), groups=self.groups) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index c46f6b8e52147..448660c4ef7c3 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -21,16 +21,14 @@ import paddle import paddle.nn as nn import PIL.Image -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.controlnet import ControlNetOutput from ...models.modeling_utils import ModelMixin from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, logging, randn_tensor, - replace_example_docstring) +from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -88,28 +86,25 @@ class MultiControlNetModel(ModelMixin): `ControlNetModel` as a list. """ - def __init__( - self, - controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): + def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): super().__init__() self.nets = nn.LayerList(controlnets) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: List[paddle.Tensor], - conditioning_scale: List[float], - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - guess_mode: bool=False, - return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]: - for i, ( - image, scale, controlnet - ) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + controlnet_cond: List[paddle.Tensor], + conditioning_scale: List[float], + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: + for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): down_samples, mid_sample = controlnet( sample, timestep, @@ -121,7 +116,8 @@ def forward( attention_mask, cross_attention_kwargs, guess_mode, - return_dict, ) + return_dict, + ) # merge samples if i == 0: @@ -129,16 +125,14 @@ def forward( else: down_block_res_samples = [ samples_prev + samples_curr - for samples_prev, samples_curr in zip( - down_block_res_samples, down_samples) + for 
samples_prev, samples_curr in zip(down_block_res_samples, down_samples) ] mid_block_res_sample += mid_sample return down_block_res_samples, mid_block_res_sample -class StableDiffusionControlNetPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -174,17 +168,22 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ - ControlNetModel], MultiControlNetModel, ], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ + ControlNetModel, + List[ControlNetModel], + Tuple[ControlNetModel], + MultiControlNetModel, + ], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -214,8 +213,9 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): @@ -250,13 +250,14 @@ def disable_vae_tiling(self): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
Args: @@ -295,32 +296,36 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -328,8 +333,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -339,21 +343,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -361,50 +366,48 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -425,55 +428,51 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - controlnet_conditioning_scale=1.0, ): + self, + prompt, + image, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -486,7 +485,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # `prompt` needs more sophisticated handling when there are multiple # conditionings. @@ -502,15 +502,12 @@ def check_inputs( self.check_image(image, prompt, prompt_embeds) elif isinstance(self.controlnet, MultiControlNetModel): if not isinstance(image, list): - raise TypeError( - "For multiple controlnets: `image` must be type `list`") + raise TypeError("For multiple controlnets: `image` must be type `list`") # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." - ) + raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( "For multiple controlnets: `image` must have the same length as the number of controlnets." @@ -524,35 +521,28 @@ def check_inputs( # Check `controlnet_conditioning_scale` if isinstance(self.controlnet, ControlNetModel): if not isinstance(controlnet_conditioning_scale, float): - raise TypeError( - "For single controlnet: `controlnet_conditioning_scale` must be type `float`." - ) + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") elif isinstance(self.controlnet, MultiControlNetModel): if isinstance(controlnet_conditioning_scale, list): - if any( - isinstance(i, list) - for i in controlnet_conditioning_scale): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." 
- ) - elif isinstance(controlnet_conditioning_scale, list) and len( - controlnet_conditioning_scale) != len(self.controlnet.nets): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): raise ValueError( "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets") + " the same length as the number of controlnets" + ) else: assert False def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, paddle.Tensor) - image_is_pil_list = isinstance(image, list) and isinstance( - image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance( - image[0], paddle.Tensor) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - if (not image_is_pil and not image_is_tensor and - not image_is_pil_list and not image_is_tensor_list): + if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: raise TypeError( "image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors" ) @@ -579,15 +569,16 @@ def check_image(self, image, prompt, prompt_embeds): ) def prepare_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, ): + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): if not isinstance(image, paddle.Tensor): if isinstance(image, PIL.Image.Image): image = [image] @@ -596,8 +587,7 @@ def prepare_image( images = [] for image_ in image: image_ = image_.convert("RGB") - image_ = image_.resize( - (width, height), resample=PIL_INTERPOLATION["lanczos"]) + image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) image_ = np.array(image_) image_ = image_[None, :] images.append(image_) @@ -627,14 +617,15 @@ def prepare_image( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -678,48 +669,47 @@ def _default_height_width(self, height, width, image): # override DiffusionPipeline def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: bool=None, ): + self, + save_directory: Union[str, os.PathLike], + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: bool = None, + ): if isinstance(self.controlnet, ControlNetModel): super().save_pretrained( save_directory, safe_serialization=safe_serialization, variant=variant, - to_diffusers=to_diffusers, ) - else: - raise NotImplementedError( - "Currently, the `save_pretrained()` is not implemented for Multi-ControlNet." 
+ to_diffusers=to_diffusers, ) + else: + raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.") @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor], - List[PIL.Image.Image]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_conditioning_scale: Union[float, List[float]]=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor], List[PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -813,7 +803,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - controlnet_conditioning_scale, ) + controlnet_conditioning_scale, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -828,10 +819,8 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - if isinstance(self.controlnet, MultiControlNetModel) and isinstance( - controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale - ] * len(self.controlnet.nets) + if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -840,7 +829,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Prepare image if isinstance(self.controlnet, ControlNetModel): @@ -852,7 +842,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) elif isinstance(self.controlnet, MultiControlNetModel): images = [] @@ -865,7 +856,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) images.append(image_) @@ -886,21 +878,19 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # controlnet(s) inference if guess_mode and do_classifier_free_guidance: @@ -919,20 +909,17 @@ def __call__( controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, guess_mode=guess_mode, - return_dict=False, ) + return_dict=False, + ) if guess_mode and do_classifier_free_guidance: # Infered ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
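# Illustrative sketch of the padding performed just below (toy shapes assumed):
# with classifier-free guidance the UNet batch is [uncond, cond], but in guess
# mode the ControlNet only ran on the conditional half, so zeros are prepended
# to every residual to leave the unconditional half unchanged.
import paddle
d = paddle.ones([1, 320, 64, 64])                  # residual for the conditional half
padded = paddle.concat([paddle.zeros_like(d), d])  # batch of 2: [zeros, d]
assert padded.shape[0] == 2 and float(padded[0].abs().sum()) == 0.0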
- down_block_res_samples = [ - paddle.concat([paddle.zeros_like(d), d]) - for d in down_block_res_samples - ] - mid_block_res_sample = paddle.concat([ - paddle.zeros_like(mid_block_res_sample), - mid_block_res_sample - ]) + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat( + [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] + ) # predict the noise residual noise_pred = self.unet( @@ -941,22 +928,19 @@ def __call__( encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, ).sample + mid_block_additional_residual=mid_block_res_sample, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -969,8 +953,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -979,11 +962,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 4a517f2085671..9bbe0ba73588b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -20,8 +20,12 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPTextModel, CLIPTokenizer, - DPTForDepthEstimation, DPTImageProcessor) +from paddlenlp.transformers import ( + CLIPTextModel, + CLIPTokenizer, + DPTForDepthEstimation, + DPTImageProcessor, +) from ...configuration_utils import FrozenDict from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin @@ -44,11 +48,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -59,8 +59,7 @@ def preprocess(image): return image -class StableDiffusionDepth2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. @@ -90,22 +89,21 @@ class StableDiffusionDepth2ImgPipeline( """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - depth_estimator: DPTForDepthEstimation, - feature_extractor: DPTImageProcessor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + depth_estimator: DPTForDepthEstimation, + feature_extractor: DPTImageProcessor, + ): super().__init__() - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -116,12 +114,9 @@ def __init__( " configuration file. 
Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -133,18 +128,20 @@ def __init__( unet=unet, scheduler=scheduler, depth_estimator=depth_estimator, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -184,29 +181,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -214,8 +213,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, 
seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -225,21 +223,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -247,47 +246,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -308,52 +303,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -366,27 +358,21 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -403,8 +389,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -412,8 +397,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -425,12 +409,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
) @@ -446,8 +429,7 @@ def prepare_latents, return latents - def prepare_depth_map(self, image, depth_map, batch_size, - do_classifier_free_guidance, dtype): + def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype): if isinstance(image, PIL.Image.Image): image = [image] else: @@ -459,27 +441,24 @@ def prepare_depth_map(self, image, depth_map, batch_size, height, width = image[0].shape[-2:] if depth_map is None: - pixel_values = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + pixel_values = self.feature_extractor(images=image, return_tensors="pd").pixel_values # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16. # TODO DPTModel `expand_as` does not support float16 with paddle.amp.auto_cast(True, level="O2"): - depth_map = self.depth_estimator( - pixel_values).predicted_depth.cast("float32") + depth_map = self.depth_estimator(pixel_values).predicted_depth.cast("float32") else: depth_map = depth_map.cast("float32") depth_map = paddle.nn.functional.interpolate( depth_map.unsqueeze(1), - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor), + size=(height // self.vae_scale_factor, width // self.vae_scale_factor), mode="bicubic", - align_corners=False, ) + align_corners=False, + ) # amin / amax do not support float16 depth_min = paddle.amin(depth_map, axis=[1, 2, 3], keepdim=True) depth_max = paddle.amax(depth_map, axis=[1, 2, 3], keepdim=True) - depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min - ) - 1.0 + depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 # maybe cast to float16 depth_map = depth_map.cast(dtype) @@ -488,30 +467,29 @@ def prepare_depth_map(self, image, depth_map, batch_size, repeat_by = batch_size // depth_map.shape[0] depth_map = depth_map.tile([repeat_by, 1, 1, 1]) - depth_map = (paddle.concat([depth_map] * 2) - if do_classifier_free_guidance else depth_map) + depth_map = paddle.concat([depth_map] * 2) if do_classifier_free_guidance else depth_map return depth_map @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - depth_map: Optional[paddle.Tensor]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + depth_map: Optional[paddle.Tensor] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, +
callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -603,7 +581,8 @@ def __call__( callback_steps, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -627,7 +606,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare depth mask depth_mask = self.prepare_depth_map( @@ -635,17 +615,16 @@ def __call__( depth_map, batch_size * num_images_per_prompt, do_classifier_free_guidance, - prompt_embeds.dtype, ) + prompt_embeds.dtype, + ) # 5. Preprocess image image = preprocess(image) # 6. Set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 7. Prepare latent variables latents = self.prepare_latents( @@ -654,44 +633,35 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, depth_mask], axis=1) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, depth_mask], axis=1) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = latents.cast(prompt_embeds.dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not 
None and i % callback_steps == 0: callback(i, t, latents) @@ -704,6 +674,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 87ea9a04eb5f6..48556ee9e0bfb 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -19,8 +19,7 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -62,14 +61,15 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline): _optional_components = ["safety_checker"] def __init__( - self, - vae: AutoencoderKL, - image_encoder: CLIPVisionModelWithProjection, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + image_encoder: CLIPVisionModelWithProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -88,12 +88,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -104,12 +102,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -120,17 +115,16 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - def _encode_image(self, image, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance): dtype = self.image_encoder.dtype if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) image_embeddings = self.image_encoder(image).image_embeds @@ -139,8 +133,7 @@ def _encode_image(self, image, num_images_per_prompt, # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: negative_prompt_embeds = paddle.zeros_like(image_embeddings) @@ -148,19 +141,17 @@ def _encode_image(self, image, num_images_per_prompt, # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -181,54 +172,56 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -244,21 +237,21 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -332,8 +325,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input image - image_embeddings = self._encode_image(image, num_images_per_prompt, - do_classifier_free_guidance) + image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -348,42 +340,33 @@ def __call__( width, image_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=image_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -392,8 +375,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, image_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype) # 10. 
Convert to PIL if output_type == "pil": @@ -402,5 +384,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index b26c0e76369b2..d8bee685bc963 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -20,17 +20,20 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + PIL_INTERPOLATION, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -74,11 +77,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -89,9 +88,7 @@ def preprocess(image): return image -class StableDiffusionImg2ImgPipeline(DiffusionPipeline, - TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. 
@@ -130,37 +127,33 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline, # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might lead to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -168,11 +161,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -193,12 +182,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -209,12 +196,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -226,22 +210,24 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.register_to_config( - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -281,36 +267,37 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -320,21 +307,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -342,36 +330,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -380,17 +365,14 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=paddle.cast(safety_checker_input.pixel_values, - dtype), ) + clip_input=paddle.cast(safety_checker_input.pixel_values, dtype), + ) return image, has_nsfw_concept def decode_latents(self, latents): @@ -406,51 +388,48 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -463,29 +442,21 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}") image = image.cast(dtype) @@ -498,8 +469,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -507,8 +477,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -520,12 +489,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
) @@ -544,24 +512,24 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -637,7 +605,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -658,17 +627,16 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image image = self.image_processor.preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables latents = self.prepare_latents( @@ -677,51 +645,45 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) else: image = latents has_nsfw_concept = None @@ -731,11 +693,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index fb09dc473b674..f1e0347160085 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -21,8 +21,7 @@ import paddle.nn.functional as F import PIL from packaging import version -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -65,14 +64,11 @@ def prepare_mask_and_masked_image(image, mask): """ if isinstance(image, paddle.Tensor): if not 
isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -89,12 +85,9 @@ def prepare_mask_and_masked_image(image, mask): else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -110,8 +103,7 @@ def prepare_mask_and_masked_image(image, mask): # Image as float32 image = image.cast(paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -131,8 +123,7 @@ def prepare_mask_and_masked_image(image, mask): mask = [mask] if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -176,49 +167,47 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "skip_prk_steps") and - scheduler.config.skip_prk_steps is False): + if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration" " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make" " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to" " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face" " Hub, it would be very nice if you could open a Pull request for the" - " `scheduler/scheduler_config.json` file") + " `scheduler/scheduler_config.json` file" + ) deprecate( "skip_prk_steps not set", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) new_config = dict(scheduler.config) new_config["skip_prk_steps"] = True scheduler._internal_dict = FrozenDict(new_config) @@ -239,12 +228,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -255,12 +242,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -272,19 +256,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -320,29 +306,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -350,8 +338,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if 
do_classifier_free_guidance and negative_prompt_embeds is None: @@ -361,14 +348,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -378,47 +367,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -430,15 +415,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -454,39 +437,37 @@ def decode_latents(self, latents): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -499,18 +480,20 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -531,22 +514,20 @@ def prepare_latents( return latents def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - dtype, - generator, - do_classifier_free_guidance, ): + self, + mask, + masked_image, + batch_size, + height, + width, + dtype, + generator, + do_classifier_free_guidance, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision - mask = F.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask = F.interpolate(mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)) mask = mask.cast(dtype) masked_image = masked_image.cast(dtype) @@ -554,13 +535,12 @@ def prepare_mask_latents( # encode the mask image into latents space so we can concatenate it to the latents if isinstance(generator, list): masked_image_latents = [ - self.vae.encode(masked_image[i:i + 1]).latent_dist.sample( - generator=generator[i]) for i in range(batch_size) + self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(batch_size) ] masked_image_latents = paddle.concat(masked_image_latents, axis=0) else: - masked_image_latents = self.vae.encode( - masked_image).latent_dist.sample(generator=generator) + masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) masked_image_latents = self.vae.config.scaling_factor * masked_image_latents # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -579,14 +559,12 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype) @@ -594,26 +572,26 @@ def prepare_mask_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -726,7 +704,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -754,7 +733,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess mask and image mask, masked_image = prepare_mask_and_masked_image(image, mask_image) @@ -772,7 +752,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. Prepare mask latent variables mask, masked_image_latents = self.prepare_mask_latents( @@ -783,60 +764,51 @@ def __call__( width, prompt_embeds.dtype, generator, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # 8. 
Check that sizes of mask, masked image and latents match num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + num_channels_masked_image - != self.unet.config.in_channels): + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 10. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, mask, masked_image_latents], axis=1) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # must cast dtype, paddle.concat has bug.... latents = latents.cast(prompt_embeds.dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -845,8 +817,7 @@ def __call__( image = self.decode_latents(latents) # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 13. 
Convert to PIL if output_type == "pil": @@ -855,5 +826,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 67150c534019e..e321d55a86336 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -21,12 +21,10 @@ import paddle.nn.functional as F import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor @@ -54,7 +52,8 @@ def preprocess_mask(mask, batch_size, scale_factor=8): w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 mask = mask.resize( (w // scale_factor, h // scale_factor), - resample=PIL_INTERPOLATION["nearest"], ) + resample=PIL_INTERPOLATION["nearest"], + ) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = np.vstack([mask[None]] * batch_size) @@ -70,7 +69,8 @@ def preprocess_mask(mask, batch_size, scale_factor=8): elif mask.shape[1] not in valid_mask_channel_sizes: raise ValueError( f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension," - f" but received mask of shape {tuple(mask.shape)}") + f" but received mask of shape {tuple(mask.shape)}" + ) # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape mask = mask.mean(1, keepdim=True) h, w = mask.shape[-2:] @@ -79,9 +79,9 @@ def preprocess_mask(mask, batch_size, scale_factor=8): return mask -class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline, - TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionInpaintPipelineLegacy( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin +): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. 
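For reference, the reformatted `StableDiffusionInpaintPipeline` whose diff concludes above is normally driven as follows; this is a minimal usage sketch, and the checkpoint name, file paths, and prompt are illustrative assumptions rather than anything taken from this diff:

import PIL.Image
from ppdiffusers import StableDiffusionInpaintPipeline

# Hypothetical 9-channel inpainting checkpoint and local files, for illustration only.
pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
init_image = PIL.Image.open("photo.png").convert("RGB").resize((512, 512))
mask_image = PIL.Image.open("mask.png").convert("L").resize((512, 512))  # mask is converted to "L" by the pipeline anyway

result = pipe(
    prompt="a red brick wall",
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=50,
    guidance_scale=7.5,
)
result.images[0].save("inpainted.png")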
@@ -119,37 +119,33 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline, # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -157,11 +153,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -182,12 +174,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -198,12 +188,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -215,19 +202,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
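The `_encode_prompt` hunks that follow batch the embeddings as [negative, positive] so that one UNet forward pass serves classifier-free guidance; a minimal sketch of how the two halves are recombined at denoising time (the helper name is ours, the arithmetic is the one used in the loops above and below):

def apply_classifier_free_guidance(noise_pred, guidance_scale=7.5):
    # noise_pred is the UNet output for paddle.concat([latents] * 2), so it
    # splits into an unconditional half and a text-conditioned half.
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)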
@@ -267,29 +256,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -297,8 +288,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -308,21 +298,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -330,47 +321,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -391,52 +378,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -449,59 +433,56 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, - generator): + def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, generator): image = image.cast(dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = self.vae.config.scaling_factor * init_latents # Expand init_latents for batch_size and num_images_per_prompt - init_latents = paddle.concat( - [init_latents] * num_images_per_prompt, axis=0) + init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) init_latents_orig = init_latents # add noise to latents using the timesteps - noise = randn_tensor( - init_latents.shape, generator=generator, dtype=dtype) + noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype) init_latents = self.scheduler.add_noise(init_latents, noise, timestep) latents = init_latents return latents, init_latents_orig, noise @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: 
Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -580,7 +561,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -602,21 +584,19 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image and mask if not isinstance(image, paddle.Tensor): image = preprocess_image(image, batch_size) - mask_image = preprocess_mask(mask_image, batch_size, - self.vae_scale_factor) + mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables # encode the init image into latents and scale the latents @@ -625,7 +605,8 @@ def __call__( latent_timestep, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare mask latent mask = mask_image.cast(latents.dtype) @@ -635,50 +616,39 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, noise_timestep) else: init_latents_proper = init_latents_orig latents = (init_latents_proper * mask) + (latents * (1 - mask)) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -687,8 +657,7 @@ def __call__( image = self.decode_latents(latents) # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 12. 
Convert to PIL if output_type == "pil": @@ -697,5 +666,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index f39e50878b44e..02b3128d40d82 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -18,8 +18,7 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -43,11 +42,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -58,8 +53,7 @@ def preprocess(image): return image -class StableDiffusionInstructPix2PixPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion. 
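The legacy inpainting loop that closes just above keeps the original content outside the edited region by re-noising the initial image latents and blending them back through the mask at every step; a condensed sketch of that blend (the helper name is ours; the real loop picks `timesteps[i + 1]`, or the predicted noise when `add_predicted_noise` is set):

def blend_masked_latents(scheduler, init_latents_orig, noise, latents, mask, noise_timestep):
    # Re-noise the original latents to the given timestep, then keep the
    # re-noised latents where mask == 1 and the freshly denoised latents elsewhere.
    init_latents_proper = scheduler.add_noise(init_latents_orig, noise, noise_timestep)
    return init_latents_proper * mask + latents * (1 - mask)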
@@ -95,15 +89,16 @@ class StableDiffusionInstructPix2PixPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -129,30 +124,31 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - num_inference_steps: int=100, - guidance_scale: float=7.5, - image_guidance_scale: float=1.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + num_inference_steps: int = 100, + guidance_scale: float = 7.5, + image_guidance_scale: float = 1.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -252,7 +248,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -268,8 +265,7 @@ def __call__( # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. 
- do_classifier_free_guidance = (guidance_scale > 1.0 and - image_guidance_scale >= 1.0) + do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0 # check if scheduler is in sigmas space scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas") @@ -280,7 +276,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 3. Preprocess image image = preprocess(image) @@ -297,7 +294,8 @@ def __call__( num_images_per_prompt, prompt_embeds.dtype, do_classifier_free_guidance, - generator, ) + generator, + ) # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels @@ -308,7 +306,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. Check that shapes of latents and image match the UNet channels num_channels_image = image_latents.shape[1] @@ -318,45 +317,40 @@ def __call__( f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_image`: {num_channels_image} " f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # Expand the latents if we are doing classifier free guidance. # The latents are expanded 3 times because for pix2pix the guidance\ # is applied for both the text and the input image. - latent_model_input = (paddle.concat([latents] * 3) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 3) if do_classifier_free_guidance else latents # concat latents, image_latents in the channel dimension - scaled_latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) scaled_latent_model_input = paddle.concat( [ scaled_latent_model_input, image_latents.cast(scaled_latent_model_input.dtype), ], - axis=1, ) + axis=1, + ) # predict the noise residual - noise_pred = self.unet( - scaled_latent_model_input, - t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # Hack: # For karras style schedulers the model does classifer free guidance using the # predicted_original_sample instead of the noise_pred. So we need to compute the # predicted_original_sample here if we are using a karras style scheduler. 
if scheduler_is_in_sigma_space: - step_index = ( - self.scheduler.timesteps == t).nonzero().item() + step_index = (self.scheduler.timesteps == t).nonzero().item() sigma = self.scheduler.sigmas[step_index] noise_pred = latent_model_input - sigma * noise_pred @@ -365,11 +359,13 @@ def __call__( ( noise_pred_text, noise_pred_image, - noise_pred_uncond, ) = noise_pred.chunk(3) - noise_pred = (noise_pred_uncond + guidance_scale * - (noise_pred_text - noise_pred_image - ) + image_guidance_scale * - (noise_pred_image - noise_pred_uncond)) + noise_pred_uncond, + ) = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_text - noise_pred_image) + + image_guidance_scale * (noise_pred_image - noise_pred_uncond) + ) # Hack: # For karras style schedulers the model does classifer free guidance using the @@ -381,13 +377,10 @@ def __call__( noise_pred = (noise_pred - latents) / (-sigma) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -396,8 +389,7 @@ def __call__( image = self.decode_latents(latents) # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 12. Convert to PIL if output_type == "pil": @@ -406,17 +398,17 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
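InstructPix2Pix applies two guidance terms, one for the text instruction and one for the input image, over a UNet batch of three latents; a minimal sketch of the combination used in the loop above (the function name is ours, the arithmetic mirrors the diff):

def apply_pix2pix_guidance(noise_pred, guidance_scale=7.5, image_guidance_scale=1.5):
    # prompt_embeds are ordered [prompt, negative, negative] and the latents are
    # tripled, so the UNet output chunks into text, image and unconditional terms.
    noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
    return (
        noise_pred_uncond
        + guidance_scale * (noise_pred_text - noise_pred_image)
        + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
    )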
@@ -456,29 +448,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -486,8 +480,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -497,21 +490,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -519,49 +513,44 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds] - prompt_embeds = paddle.concat([ - prompt_embeds, negative_prompt_embeds, negative_prompt_embeds - ]) + prompt_embeds = paddle.concat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -573,15 +562,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -596,32 +583,32 @@ def decode_latents(self, latents): return image def check_inputs( - self, - prompt, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -634,23 +621,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -665,13 +655,14 @@ def prepare_latents( return latents def prepare_image_latents( - self, - image, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance, - generator=None, ): + self, + image, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance, + generator=None, + ): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -687,16 +678,12 @@ def prepare_image_latents( ) if isinstance(generator, list): - image_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.mode() - for i in range(batch_size) - ] + image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)] image_latents = paddle.concat(image_latents, axis=0) else: image_latents = self.vae.encode(image).latent_dist.mode() - if (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] == 0): + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: # expand image_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" @@ -708,12 +695,11 @@ def prepare_image_latents( "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // image_latents.shape[0] - image_latents = paddle.concat( - [image_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] != 0): + image_latents = paddle.concat([image_latents] * additional_image_per_prompt, axis=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
) @@ -722,7 +708,6 @@ def prepare_image_latents( if do_classifier_free_guidance: uncond_image_latents = paddle.zeros_like(image_latents) - image_latents = paddle.concat( - [image_latents, image_latents, uncond_image_latents], axis=0) + image_latents = paddle.concat([image_latents, image_latents, uncond_image_latents], axis=0) return image_latents diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index f4fdd86cdbfb6..9151849ce7309 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -75,12 +75,13 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: EulerDiscreteScheduler, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: EulerDiscreteScheduler, + ): super().__init__() self.register_modules( @@ -88,10 +89,10 @@ def __init__( text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) - def _encode_prompt(self, prompt, do_classifier_free_guidance, - negative_prompt): + def _encode_prompt(self, prompt, do_classifier_free_guidance, negative_prompt): r""" Encodes the prompt into text encoder hidden states. @@ -112,23 +113,25 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, max_length=self.tokenizer.model_max_length, truncation=True, return_length=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) text_encoder_out = self.text_encoder( text_input_ids, - output_hidden_states=True, ) + output_hidden_states=True, + ) text_embeddings = text_encoder_out.hidden_states[-1] text_pooler_out = text_encoder_out.pooler_output @@ -140,14 +143,16 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -158,11 +163,13 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, max_length=max_length, truncation=True, return_length=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_encoder_out = self.text_encoder( uncond_input.input_ids, - output_hidden_states=True, ) + output_hidden_states=True, + ) uncond_embeddings = uncond_encoder_out.hidden_states[-1] uncond_pooler_out = uncond_encoder_out.pooler_output @@ -170,10 +177,8 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) - text_pooler_out = paddle.concat( - [uncond_pooler_out, text_pooler_out]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) + text_pooler_out = paddle.concat([uncond_pooler_out, text_pooler_out]) return text_embeddings, text_pooler_out @@ -188,13 +193,13 @@ def decode_latents(self, latents): def check_inputs(self, prompt, image, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" ) @@ -216,30 +221,30 @@ def check_inputs(self, prompt, image, callback_steps): ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = (batch_size, num_channels_latents, height, width) if latents is None: latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -247,19 +252,19 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], - num_inference_steps: int=75, - guidance_scale: float=9.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -362,16 +367,14 @@ def __call__( prompt = [""] * batch_size # 3. Encode input prompt - text_embeddings, text_pooler_out = self._encode_prompt( - prompt, do_classifier_free_guidance, negative_prompt) + text_embeddings, text_pooler_out = self._encode_prompt(prompt, do_classifier_free_guidance, negative_prompt) # 4. Preprocess image image = preprocess(image) image = image.cast(text_embeddings.dtype) if image.shape[1] == 3: # encode image if not in latent-space yet - image = (self.vae.encode(image).latent_dist.sample() * - self.vae.config.scaling_factor) + image = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -386,27 +389,23 @@ def __call__( # "the This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default." 
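A minimal sketch of the conditioning-image preconditioning performed just below, assuming a scalar noise level: the low-resolution latents are upsampled 2x and scaled by c_in(sigma) = (sigma^2 + 1)^(-1/2) from Table 1 of Karras et al. (2022); with the noise level fixed at 0.0 the scale is exactly 1. The helper name is illustrative only:

import paddle
import paddle.nn.functional as F

def precondition_image_cond(image, noise_level=0.0):
    # image: [batch, C, H, W] latents of the low-resolution input
    sigma = paddle.to_tensor([noise_level], dtype=paddle.float32)
    c_in = (sigma**2 + 1) ** (-0.5)  # equals 1.0 when noise_level == 0.0
    # F.interpolate does not support float16, hence the float32 cast as in the pipeline below
    upsampled = F.interpolate(image.cast("float32"), scale_factor=2, mode="nearest")
    return upsampled * c_in[:, None, None, None]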
noise_level = paddle.to_tensor([0.0], dtype=paddle.float32) noise_level = paddle.concat([noise_level] * image.shape[0]) - inv_noise_level = (noise_level**2 + 1)**(-0.5) + inv_noise_level = (noise_level**2 + 1) ** (-0.5) # TODO F.interpolate donot support float16 - image_cond = (F.interpolate( - image.cast("float32"), scale_factor=2, - mode="nearest") * inv_noise_level[:, None, None, None]) + image_cond = ( + F.interpolate(image.cast("float32"), scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None] + ) image_cond = image_cond.cast(text_embeddings.dtype) noise_level_embed = paddle.concat( [ - paddle.ones( - [text_pooler_out.shape[0], 64], - dtype=text_pooler_out.dtype), - paddle.zeros( - [text_pooler_out.shape[0], 64], - dtype=text_pooler_out.dtype), + paddle.ones([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype), + paddle.zeros([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype), ], - axis=1, ) + axis=1, + ) - timestep_condition = paddle.concat( - [noise_level_embed, text_pooler_out], axis=1) + timestep_condition = paddle.concat([noise_level_embed, text_pooler_out], axis=1) # 6. Prepare latent variables height, width = image.shape[2:] @@ -418,7 +417,8 @@ def __call__( width * 2, text_embeddings.dtype, generator, - latents, ) + latents, + ) # 7. Check that sizes of image and latents match num_channels_image = image.shape[1] @@ -428,7 +428,8 @@ def __call__( f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_image`: {num_channels_image} " f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 9. Denoising loop num_warmup_steps = 0 @@ -437,48 +438,39 @@ def __call__( for i, t in enumerate(timesteps): sigma = self.scheduler.sigmas[i] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - scaled_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) scaled_model_input = paddle.concat( - [ - scaled_model_input, - image_cond.cast(scaled_model_input.dtype) - ], - axis=1, ) + [scaled_model_input, image_cond.cast(scaled_model_input.dtype)], + axis=1, + ) # preconditioning parameter based on Karras et al. (2022) (table 1) timestep = paddle.log(sigma) * 0.25 noise_pred = self.unet( scaled_model_input, timestep, encoder_hidden_states=text_embeddings, - timestep_cond=timestep_condition, ).sample + timestep_cond=timestep_condition, + ).sample # in original repo, the output contains a variance channel that's not used noise_pred = noise_pred[:, :-1] # apply preconditioning, based on table 1 in Karras et al. 
(2022) inv_sigma = 1 / (sigma**2 + 1) - noise_pred = ( - inv_sigma * latent_model_input + - self.scheduler.scale_model_input(sigma, t) * noise_pred) + noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, - latents).prev_sample + latents = self.scheduler.step(noise_pred, t, latents).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -491,6 +483,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py index 13e7d28b153ee..93a2487ee267a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py @@ -21,8 +21,9 @@ from ...utils import logging from .pipeline_stable_diffusion import StableDiffusionPipeline from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline -from .pipeline_stable_diffusion_inpaint_legacy import \ - StableDiffusionInpaintPipelineLegacy +from .pipeline_stable_diffusion_inpaint_legacy import ( + StableDiffusionInpaintPipelineLegacy, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -61,36 +62,31 @@ def __call__(self, *args, **kwargs): return self.text2img(*args, **kwargs) def text2img( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: 
Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): - expected_components = inspect.signature( - StableDiffusionPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(StableDiffusionPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = StableDiffusionPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) output = temp_pipeline( prompt=prompt, height=height, @@ -108,38 +104,34 @@ def text2img( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) return output def img2img( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): - expected_components = inspect.signature( - StableDiffusionImg2ImgPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): + expected_components = inspect.signature(StableDiffusionImg2ImgPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = StableDiffusionImg2ImgPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) output = temp_pipeline( prompt=prompt, image=image, @@ -156,41 +148,37 @@ def img2img( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) return output def inpaint_legacy( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - 
num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): - expected_components = inspect.signature( - StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): + expected_components = inspect.signature(StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = StableDiffusionInpaintPipelineLegacy( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) output = temp_pipeline( prompt=prompt, image=image, @@ -209,6 +197,7 @@ def inpaint_legacy( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) return output diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index ce7f96b22cc24..3ad5c35785e9a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -16,8 +16,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -48,8 +47,7 @@ """ -class StableDiffusionModelEditingPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models". This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the @@ -80,22 +78,22 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: SchedulerMixin, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, - with_to_k: bool=True, - with_augs: list=AUGS_CONST, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: SchedulerMixin, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + with_to_k: bool = True, + with_augs: list = AUGS_CONST, + ): super().__init__() if isinstance(scheduler, PNDMScheduler): - logger.error( - "PNDMScheduler for this pipeline is currently not supported.") + logger.error("PNDMScheduler for this pipeline is currently not supported.") if safety_checker is None and requires_safety_checker: logger.warning( @@ -120,8 +118,9 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) self.with_to_k = with_to_k @@ -147,18 +146,12 @@ def append_ca(net_): append_ca(net[1]) # get projection matrices - self.ca_clip_layers = [ - l for l in ca_layers if l.to_v.in_features == 768 - ] + self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768] self.projection_matrices = [l.to_v for l in self.ca_clip_layers] self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers] if self.with_to_k: - self.projection_matrices = self.projection_matrices + [ - l.to_k for l in self.ca_clip_layers - ] - self.og_matrices = self.og_matrices + [ - copy.deepcopy(l.to_k) for l in self.ca_clip_layers - ] + self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers] + self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers] # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing def enable_vae_slicing(self): @@ -179,13 +172,14 @@ def disable_vae_slicing(self): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
Args: @@ -224,29 +218,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) @@ -254,8 +250,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -265,21 +260,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -287,47 +283,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -348,54 +340,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -408,23 +396,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -440,11 +431,12 @@ def prepare_latents( @paddle.no_grad() def edit_model( - self, - source_prompt: str, - destination_prompt: str, - lamb: float=0.1, - restart_params: bool=True, ): + self, + source_prompt: str, + destination_prompt: str, + lamb: float = 0.1, + restart_params: bool = True, + ): r""" Apply model editing via closed-form solution (see Eq. 
5 in the TIME paper https://arxiv.org/abs/2303.08084) Args: @@ -467,20 +459,17 @@ def edit_model( l.to_v = copy.deepcopy(self.og_matrices[idx_]) self.projection_matrices[idx_] = l.to_v if self.with_to_k: - l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + - idx_]) + l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_]) self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k # set up sentences old_texts = [source_prompt] new_texts = [destination_prompt] # add augmentations - base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][ - 1:] + base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] for aug in self.with_augs: old_texts.append(aug + base) - base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][ - 1:] + base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] for aug in self.with_augs: new_texts.append(aug + base) @@ -492,7 +481,8 @@ def edit_model( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_embeddings = self.text_encoder(text_input.input_ids)[0] old_emb, new_emb = text_embeddings old_embs.append(old_emb) @@ -504,12 +494,12 @@ def edit_model( tokens_a = self.tokenizer(old_text).input_ids tokens_b = self.tokenizer(new_text).input_ids tokens_a = [ - self.tokenizer.encode("a ")["input_ids"][1] - if self.tokenizer.decode(t) == "an" else t for t in tokens_a + self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t + for t in tokens_a ] tokens_b = [ - self.tokenizer.encode("a ")["input_ids"][1] - if self.tokenizer.decode(t) == "an" else t for t in tokens_b + self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t + for t in tokens_b ] num_orig_tokens = len(tokens_a) idxs_replace = [] @@ -529,8 +519,7 @@ def edit_model( # prepare batch: for each pair of setences, old context and new values contexts, valuess = [], [] - for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, - idxs_replaces): + for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces): context = old_emb.detach() values = [] with paddle.no_grad(): @@ -545,52 +534,47 @@ def edit_model( mat1 = lamb * self.projection_matrices[layer_num].weight # mat2 = \lambda I + \sum{k k^T} - mat2 = lamb * paddle.eye(self.projection_matrices[layer_num] - .weight.shape[1]) + mat2 = lamb * paddle.eye(self.projection_matrices[layer_num].weight.shape[1]) # aggregate sums for mat1, mat2 for context, values in zip(contexts, valuess): - context_vector = context.reshape( - [context.shape[0], context.shape[1], 1]) - context_vector_T = context.reshape( - [context.shape[0], 1, context.shape[1]]) - value_vector = values[layer_num].reshape([ - values[layer_num].shape[0], values[layer_num].shape[1], 1 - ]) - for_mat1 = (value_vector @context_vector_T).sum(axis=0) - for_mat2 = (context_vector @context_vector_T).sum(axis=0) + context_vector = context.reshape([context.shape[0], context.shape[1], 1]) + context_vector_T = context.reshape([context.shape[0], 1, context.shape[1]]) + value_vector = values[layer_num].reshape([values[layer_num].shape[0], values[layer_num].shape[1], 1]) + for_mat1 = (value_vector @ context_vector_T).sum(axis=0) + for_mat2 = (context_vector @ context_vector_T).sum(axis=0) mat1 += for_mat1 mat2 += for_mat2 # update projection matrix - mat = mat1 @paddle.inverse(mat2) - self.projection_matrices[ - layer_num].weight = paddle.create_parameter( - 
shape=mat.shape, - dtype=mat.dtype, - default_initializer=paddle.nn.initializer.Assign(mat), ) + mat = mat1 @ paddle.inverse(mat2) + self.projection_matrices[layer_num].weight = paddle.create_parameter( + shape=mat.shape, + dtype=mat.dtype, + default_initializer=paddle.nn.initializer.Assign(mat), + ) @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -668,7 +652,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -690,7 +675,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -705,43 +691,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -754,8 +735,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -764,11 +744,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index cc2586bec5107..5258f174894bf 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -15,8 +15,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -46,8 +45,7 @@ """ -class StableDiffusionPanoramaPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using "MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation". @@ -81,20 +79,20 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if isinstance(scheduler, PNDMScheduler): - logger.error( - "PNDMScheduler for this pipeline is currently not supported.") + logger.error("PNDMScheduler for this pipeline is currently not supported.") if safety_checker is None and requires_safety_checker: logger.warning( @@ -119,19 +117,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder 
hidden states. @@ -171,29 +171,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -201,8 +203,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -212,21 +213,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -234,47 +236,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
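For reference, a compact sketch of the standard two-pass classifier-free guidance that this [negative, positive] embedding order enables; the combination mirrors the `noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)` expression used in the denoising loops elsewhere in this diff, and the helper name is illustrative only:

import paddle

def apply_cfg(noise_pred, guidance_scale=7.5):
    # noise_pred: [2 * batch, C, H, W]; the first half comes from the negative/unconditional
    # embeddings, the second half from the text-conditioned embeddings
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)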
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -295,54 +293,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
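The prepare_extra_step_kwargs hunk above keeps the signature probing that decides whether eta and generator can be forwarded to the scheduler. A small self-contained sketch of the same introspection (scheduler here is any object with a step method):

import inspect

def build_step_kwargs(scheduler, eta, generator):
    params = set(inspect.signature(scheduler.step).parameters.keys())
    kwargs = {}
    if "eta" in params:        # eta (η) only applies to DDIM-style schedulers
        kwargs["eta"] = eta
    if "generator" in params:  # some schedulers draw extra noise and accept a generator
        kwargs["generator"] = generator
    return kwargs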
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -355,23 +349,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -385,11 +382,7 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - def get_views(self, - panorama_height, - panorama_width, - window_size=64, - stride=8): + def get_views(self, panorama_height, panorama_width, window_size=64, stride=8): # Here, we define the mappings F_i (see Eq. 
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) panorama_height /= 8 panorama_width /= 8 @@ -408,25 +401,25 @@ def get_views(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=512, - width: Optional[int]=2048, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 2048, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -508,7 +501,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -530,7 +524,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -545,7 +540,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Define panorama grid and initialize views for synthesis. views = self.get_views(height, width) @@ -558,8 +554,7 @@ def __call__( # 8. Denoising loop # Each denoising step also includes refinement of the latents with respect to the # views. 
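The panorama pipeline above splits the latent into overlapping windows (get_views) and, per Eq. 5 of the MultiDiffusion paper, averages the per-window denoised latents back together through the value/count buffers and paddle.where(count > 0, value / count, value). A rough sketch of the window enumeration; the block-count arithmetic here is my assumption, the pipeline's own get_views is the reference:

def sliding_views(height, width, window_size=64, stride=8):
    # work in latent space (factor-8 downsampling), then enumerate overlapping windows
    height, width = height // 8, width // 8
    num_h = (height - window_size) // stride + 1 if height > window_size else 1
    num_w = (width - window_size) // stride + 1 if width > window_size else 1
    views = []
    for i in range(num_h * num_w):
        h_start = (i // num_w) * stride
        w_start = (i % num_w) * stride
        views.append((h_start, h_start + window_size, w_start, w_start + window_size))
    return views

# a 512x2048 panorama gives 64x64 latent windows stepped every 8 latent pixels
print(len(sliding_views(512, 2048)))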
- num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): count.zero_() @@ -572,44 +567,39 @@ def __call__( # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113 for h_start, h_end, w_start, w_end in views: # get the latents corresponding to the current view coordinates - latents_for_view = latents[:, :, h_start:h_end, w_start: - w_end] + latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents_for_view] * 2) - if do_classifier_free_guidance else - latents_for_view) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 latents_view_denoised = self.scheduler.step( - noise_pred, t, latents_for_view, - **extra_step_kwargs).prev_sample - value[:, :, h_start:h_end, w_start: - w_end] += latents_view_denoised + noise_pred, t, latents_for_view, **extra_step_kwargs + ).prev_sample + value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised count[:, :, h_start:h_end, w_start:w_end] += 1 # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113 latents = paddle.where(count > 0, value / count, value) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -618,8 +608,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. 
Convert to PIL if output_type == "pil": @@ -628,5 +617,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 1ae1d85aacf36..7a5cb8d8a0a5e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -22,19 +22,33 @@ import paddle.nn.functional as F import paddle.optimizer import PIL -from paddlenlp.transformers import (BlipForConditionalGeneration, BlipProcessor, - CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import ( + BlipForConditionalGeneration, + BlipProcessor, + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, +) from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention -from ...schedulers import (DDIMScheduler, DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler) +from ...schedulers import ( + DDIMScheduler, + DDPMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, +) from ...schedulers.scheduling_ddim_inverse import DDIMInverseScheduler -from ...utils import (PIL_INTERPOLATION, BaseOutput, deprecate, logging, - randint_tensor, randn_tensor, replace_example_docstring) +from ...utils import ( + PIL_INTERPOLATION, + BaseOutput, + deprecate, + logging, + randint_tensor, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -172,11 +186,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -194,13 +204,11 @@ def prepare_unet(unet: UNet2DConditionModel): module_name = name.replace(".processor", "") module: nn.Layer = unet.get_sublayer(module_name) if "attn2" in name: - pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor( - is_pix2pix_zero=True) + pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=True) for params in module.parameters(): params.stop_gradient = False else: - pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor( - is_pix2pix_zero=False) + pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=False) for params in module.parameters(): params.stop_gradient = True @@ -213,7 +221,7 @@ def __init__(self): self.loss = 0.0 def compute_loss(self, predictions, targets): - self.loss += ((predictions - targets)**2).sum((1, 2)).mean(0) + self.loss += ((predictions - targets) ** 2).sum((1, 2)).mean(0) class Pix2PixZeroAttnProcessor: @@ -226,23 +234,22 @@ def __init__(self, is_pix2pix_zero=False): self.reference_cross_attn_map = {} def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - timestep=None, - loss=None, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + timestep=None, + loss=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -255,14 +262,11 @@ def __call__( if self.is_pix2pix_zero and timestep is not None: # new bookkeeping to save the attention weights. 
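The Pix2PixZeroAttnProcessor hunks above do two jobs with the cross-attention maps: on the reference pass (loss is None) they store the map keyed by timestep, and on the edited pass they pop it back out and feed it to Pix2PixZeroL2Loss.compute_loss. A toy stand-in for just that bookkeeping, with loss_tracker as a placeholder for the pipeline's loss object:

class AttnMapBookkeeping:
    def __init__(self):
        self.reference_maps = {}

    def record_or_compare(self, timestep, attn_probs, loss_tracker=None):
        if loss_tracker is None:
            # reference pass: remember this timestep's cross-attention map
            self.reference_maps[timestep] = attn_probs
        else:
            # edited pass: penalize drift away from the stored map
            reference = self.reference_maps.pop(timestep)
            loss_tracker.compute_loss(attn_probs, reference)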
if loss is None: - self.reference_cross_attn_map[timestep.item( - )] = attention_probs.detach().flatten(0, 1) + self.reference_cross_attn_map[timestep.item()] = attention_probs.detach().flatten(0, 1) # compute loss elif loss is not None: - prev_attn_probs = self.reference_cross_attn_map.pop( - timestep.item()) - loss.compute_loss( - attention_probs.flatten(0, 1), prev_attn_probs) + prev_attn_probs = self.reference_cross_attn_map.pop(timestep.item()) + loss.compute_loss(attention_probs.flatten(0, 1), prev_attn_probs) hidden_states = paddle.matmul(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) @@ -314,20 +318,24 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): ] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDPMScheduler, DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, ], - feature_extractor: CLIPImageProcessor, - safety_checker: StableDiffusionSafetyChecker, - inverse_scheduler: DDIMInverseScheduler, - caption_generator: BlipForConditionalGeneration, - caption_processor: BlipProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + DDPMScheduler, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + ], + feature_extractor: CLIPImageProcessor, + safety_checker: StableDiffusionSafetyChecker, + inverse_scheduler: DDIMInverseScheduler, + caption_generator: BlipForConditionalGeneration, + caption_processor: BlipProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -356,19 +364,21 @@ def __init__( feature_extractor=feature_extractor, caption_processor=caption_processor, caption_generator=caption_generator, - inverse_scheduler=inverse_scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + inverse_scheduler=inverse_scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
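The __init__ hunk above also reflows self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1); that factor is what prepare_latents later divides height and width by. A quick worked example (the block_out_channels list is illustrative, not a real config):

def latent_shape(batch_size, num_channels_latents, height, width, block_out_channels):
    # each VAE down block halves the spatial resolution, hence the power of two
    vae_scale_factor = 2 ** (len(block_out_channels) - 1)
    return (batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)

# a 4-level VAE gives factor 8, so a 512x512 image maps to 64x64 latents
print(latent_shape(1, 4, 512, 512, [128, 256, 512, 512]))  # (1, 4, 64, 64)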
@@ -408,29 +418,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -438,8 +450,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -449,21 +460,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -471,47 +483,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
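The tile/reshape pair above is how one set of prompt embeddings is duplicated for num_images_per_prompt generations: repeat along the sequence axis, then fold the copies into the batch axis. A compact sketch with made-up tensor sizes:

import paddle

def duplicate_per_prompt(prompt_embeds, num_images_per_prompt):
    bs, seq_len, dim = prompt_embeds.shape
    out = prompt_embeds.tile([1, num_images_per_prompt, 1])         # repeat copies
    return out.reshape([bs * num_images_per_prompt, seq_len, dim])  # fold into batch

embeds = paddle.randn([2, 77, 768])
print(duplicate_per_prompt(embeds, 3).shape)  # [6, 77, 768]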
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -532,66 +540,65 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - source_embeds, - target_embeds, - callback_steps, - prompt_embeds=None, ): + self, + prompt, + image, + source_embeds, + target_embeds, + callback_steps, + prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if source_embeds is None and target_embeds is None: - raise ValueError( - "`source_embeds` and `target_embeds` cannot be undefined.") + raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.") if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -611,43 +618,38 @@ def generate_caption(self, images): # make sure cast caption_generator position_ids dtype int64 try: self.caption_generator.text_decoder.bert.embeddings.position_ids = ( - self.caption_generator.text_decoder.bert.embeddings. - position_ids.cast("int64")) + self.caption_generator.text_decoder.bert.embeddings.position_ids.cast("int64") + ) except Exception: pass text = "a photography of" - inputs = self.caption_processor( - images=images, text=text, return_tensors="pd") - inputs["pixel_values"] = inputs["pixel_values"].cast( - self.caption_generator.dtype) + inputs = self.caption_processor(images=images, text=text, return_tensors="pd") + inputs["pixel_values"] = inputs["pixel_values"].cast(self.caption_generator.dtype) outputs = self.caption_generator.generate(**inputs, max_length=128)[0] # offload caption generator - caption = self.caption_processor.batch_decode( - outputs, skip_special_tokens=True)[0] + caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0] return text + " " + caption - def construct_direction(self, - embs_source: paddle.Tensor, - embs_target: paddle.Tensor): + def construct_direction(self, embs_source: paddle.Tensor, embs_target: paddle.Tensor): """Constructs the edit direction to steer the image generation process semantically.""" return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0) @paddle.no_grad() - def get_embeds(self, prompt: List[str], - batch_size: int=16) -> paddle.Tensor: + def get_embeds(self, prompt: List[str], batch_size: int = 16) -> paddle.Tensor: num_prompts = len(prompt) embeds = [] for i in range(0, num_prompts, batch_size): - prompt_slice = prompt[i:i + batch_size] + prompt_slice = prompt[i : i + batch_size] input_ids = self.tokenizer( prompt_slice, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids embeds.append(self.text_encoder(input_ids)[0]) @@ -668,10 +670,7 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None): ) if isinstance(generator, list): - latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) - ] + latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] latents = paddle.concat(latents, axis=0) else: latents = self.vae.encode(image).latent_dist.sample(generator) @@ -691,10 +690,10 @@ def 
prepare_image_latents(self, image, batch_size, dtype, generator=None): "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_latents_per_image = batch_size // latents.shape[0] - latents = paddle.concat( - [latents] * additional_latents_per_image, axis=0) + latents = paddle.concat([latents] * additional_latents_per_image, axis=0) else: raise ValueError( f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." @@ -704,21 +703,16 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None): return latents - def get_epsilon(self, - model_output: paddle.Tensor, - sample: paddle.Tensor, - timestep: int): + def get_epsilon(self, model_output: paddle.Tensor, sample: paddle.Tensor, timestep: int): pred_type = self.inverse_scheduler.config.prediction_type alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t if pred_type == "epsilon": return model_output elif pred_type == "sample": - return (sample - alpha_prod_t** - (0.5) * model_output) / beta_prod_t**(0.5) + return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5) elif pred_type == "v_prediction": - return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5 - ) * sample + return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`" @@ -728,15 +722,11 @@ def auto_corr_loss(self, hidden_states, generator=None): reg_loss = 0.0 for i in range(hidden_states.shape[0]): for j in range(hidden_states.shape[1]): - noise = hidden_states[i:i + 1, j:j + 1, :, :] + noise = hidden_states[i : i + 1, j : j + 1, :, :] while True: - roll_amount = randint_tensor( - noise.shape[2] // 2, shape=(1, ), - generator=generator).item() - reg_loss += (noise * paddle.roll( - noise, shifts=roll_amount, axis=2)).mean()**2 - reg_loss += (noise * paddle.roll( - noise, shifts=roll_amount, axis=3)).mean()**2 + roll_amount = randint_tensor(noise.shape[2] // 2, shape=(1,), generator=generator).item() + reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=2)).mean() ** 2 + reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=3)).mean() ** 2 if noise.shape[2] <= 8: break @@ -751,29 +741,29 @@ def kl_divergence(self, hidden_states): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Optional[Union[str, List[str]]]=None, - image: Optional[Union[paddle.Tensor, PIL.Image.Image]]=None, - source_embeds: paddle.Tensor=None, - target_embeds: paddle.Tensor=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - cross_attention_guidance_amount: float=0.1, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Optional[Union[str, List[str]]] = None, + image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None, + source_embeds: 
paddle.Tensor = None, + target_embeds: paddle.Tensor = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + cross_attention_guidance_amount: float = 0.1, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -857,7 +847,8 @@ def __call__( source_embeds, target_embeds, callback_steps, - prompt_embeds, ) + prompt_embeds, + ) # 3. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -881,7 +872,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -897,7 +889,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) latents_init = latents.clone() # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -908,37 +901,31 @@ def __call__( self.unet = prepare_unet(self.unet) # 7. Denoising loop where we obtain the cross-attention maps. - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs={"timestep": t}, ).sample + cross_attention_kwargs={"timestep": t}, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -952,15 +939,12 @@ def __call__( # 10. 
Second denoising loop to generate the edited image. latents = latents_init - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # we want to learn the latent such that it steers the generation # process towards the edited direction, so make the make initial @@ -969,9 +953,7 @@ def __call__( x_in.stop_gradient = False # optimizer - opt = paddle.optimizer.SGD( - parameters=[x_in], - learning_rate=cross_attention_guidance_amount) + opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=cross_attention_guidance_amount) with paddle.set_grad_enabled(True): # initialize loss @@ -982,8 +964,8 @@ def __call__( x_in, t, encoder_hidden_states=prompt_embeds_edit.detach(), - cross_attention_kwargs={"timestep": t, - "loss": loss}, ).sample + cross_attention_kwargs={"timestep": t, "loss": loss}, + ).sample loss.loss.backward(retain_graph=False) opt.step() @@ -993,32 +975,28 @@ def __call__( x_in.detach(), t, encoder_hidden_states=prompt_embeds_edit, - cross_attention_kwargs={"timestep": None}, ).sample + cross_attention_kwargs={"timestep": None}, + ).sample latents = x_in.detach().chunk(2)[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() # 11. Post-process the latents. edited_image = self.decode_latents(latents) # 12. Run the safety checker. - edited_image, has_nsfw_concept = self.run_safety_checker( - edited_image, prompt_embeds.dtype) + edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, prompt_embeds.dtype) # 13. Convert to PIL. 
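The second denoising loop above treats the latent itself as the thing being optimized: x_in gets stop_gradient = False, a plain SGD optimizer with learning_rate=cross_attention_guidance_amount updates it, and the loss measures the mismatch between current and reference cross-attention maps. A heavily simplified single-step sketch, where attn_loss_fn is a placeholder for the UNet forward that fills Pix2PixZeroL2Loss:

import paddle

def refine_latent_once(x_in, attn_loss_fn, guidance_amount):
    x_in.stop_gradient = False
    opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=guidance_amount)
    with paddle.set_grad_enabled(True):
        loss = attn_loss_fn(x_in)   # scalar cross-attention-map loss
        loss.backward()
        opt.step()
    return x_in.detach()            # refined latent used for the real noise prediction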
if output_type == "pil": @@ -1027,31 +1005,30 @@ def __call__( if not return_dict: return (edited_image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=edited_image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING) def invert( - self, - prompt: Optional[str]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - num_inference_steps: int=50, - guidance_scale: float=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - cross_attention_guidance_amount: float=0.1, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - lambda_auto_corr: float=20.0, - lambda_kl: float=20.0, - num_reg_steps: int=5, - num_auto_corr_rolls: int=5, ): + self, + prompt: Optional[str] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + num_inference_steps: int = 50, + guidance_scale: float = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + cross_attention_guidance_amount: float = 0.1, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + lambda_auto_corr: float = 20.0, + lambda_kl: float = 20.0, + num_reg_steps: int = 5, + num_auto_corr_rolls: int = 5, + ): r""" Function used to generate inverted latents given a prompt and image. @@ -1130,8 +1107,7 @@ def invert( image = preprocess(image) # 4. Prepare latent variables - latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, - generator) + latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, generator) # 5. Encode input prompt num_images_per_prompt = 1 @@ -1139,7 +1115,8 @@ def invert( prompt, num_images_per_prompt, do_classifier_free_guidance, - prompt_embeds=prompt_embeds, ) + prompt_embeds=prompt_embeds, + ) # 4. Prepare timesteps self.inverse_scheduler.set_timesteps(num_inference_steps) @@ -1150,28 +1127,25 @@ def invert( self.unet = prepare_unet(self.unet) # 7. Denoising loop where we obtain the cross-attention maps. 
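The inversion loop that follows regularizes the predicted noise toward IID standard normal with an auto-correlation term (random rolls of the noise against itself) and a KL term. A hedged sketch of one auto-correlation penalty, using paddle.randint in place of the pipeline's randint_tensor helper:

import paddle

def auto_corr_penalty(noise, max_shift):
    # correlate the noise with a randomly rolled copy of itself along H and W;
    # for true white noise this expectation is ~0, so its square is a cheap penalty
    shift = paddle.randint(1, max_shift, shape=[1]).item()
    penalty = (noise * paddle.roll(noise, shifts=shift, axis=2)).mean() ** 2
    penalty += (noise * paddle.roll(noise, shifts=shift, axis=3)).mean() ** 2
    return penalty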
- num_warmup_steps = ( - len(timesteps) - num_inference_steps * self.inverse_scheduler.order) + num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order with self.progress_bar(total=num_inference_steps - 1) as progress_bar: for i, t in enumerate(timesteps[:-1]): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.inverse_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs={"timestep": t}, ).sample + cross_attention_kwargs={"timestep": t}, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # regularization of the noise prediction with paddle.set_grad_enabled(True): @@ -1182,11 +1156,9 @@ def invert( var.stop_gradient = False # Derive epsilon from model output before regularizing to IID standard normal - var_epsilon = self.get_epsilon( - var, latent_model_input.detach(), t) + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) - l_ac = self.auto_corr_loss( - var_epsilon, generator=generator) + l_ac = self.auto_corr_loss(var_epsilon, generator=generator) l_ac.backward() grad = var.grad.detach() / num_auto_corr_rolls @@ -1197,8 +1169,7 @@ def invert( var.stop_gradient = False # Derive epsilon from model output before regularizing to IID standard normal - var_epsilon = self.get_epsilon( - var, latent_model_input.detach(), t) + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) l_kld = self.kl_divergence(var_epsilon) l_kld.backward() @@ -1209,13 +1180,12 @@ def invert( noise_pred = noise_pred.detach() # compute the previous noisy sample x_t -> x_t-1 - latents = self.inverse_scheduler.step(noise_pred, t, - latents).prev_sample + latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample # call the callback, if provided if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.inverse_scheduler.order == 0): + (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0 + ): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1232,5 +1202,4 @@ def invert( if not return_dict: return (inverted_latents, image) - return Pix2PixInversionPipelineOutput( - latents=inverted_latents, images=image) + return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 56fac99a80c30..3a8030d6a986d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -17,8 +17,7 @@ import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from 
paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -52,21 +51,20 @@ def __init__(self): self.attention_probs = None def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, ): + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -90,8 +88,7 @@ def __call__( # Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input -class StableDiffusionSAGPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -121,15 +118,16 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() self.register_modules( @@ -139,19 +137,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
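CrossAttnStoreProcessor above exists mostly for its side effect: it keeps the latest self-attention probabilities so the SAG step can threshold them into a mask later. A hedged stand-in that shows only that bookkeeping with plain scaled dot-product attention (the real processor goes through attn.to_q/to_k/to_v and the prepared attention mask):

import paddle
import paddle.nn.functional as F

class StoreAttnProbsSketch:
    def __init__(self):
        self.attention_probs = None

    def __call__(self, query, key, value, scale=1.0):
        probs = F.softmax(paddle.matmul(query, key, transpose_y=True) * scale, axis=-1)
        self.attention_probs = probs           # read later by the SAG masking step
        return paddle.matmul(probs, value)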
@@ -191,29 +191,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -221,8 +223,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -232,21 +233,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -254,47 +256,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
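The hasattr/use_attention_mask guard above decides whether the tokenizer's attention mask is forwarded to the text encoder at all. The same check, isolated (text_encoder_config and tokenized stand for whatever objects the pipeline already holds):

def pick_attention_mask(text_encoder_config, tokenized):
    # only pass a mask when the text encoder was configured to consume one
    if getattr(text_encoder_config, "use_attention_mask", False):
        return tokenized.attention_mask
    return None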
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -315,54 +313,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -375,23 +369,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -408,26 +405,26 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - sag_scale: float=0.75, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + sag_scale: float = 0.75, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -512,7 +509,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -538,7 +536,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -553,17 +552,16 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop store_processor = CrossAttnStoreProcessor() - self.unet.mid_block.attentions[0].transformer_blocks[ - 0].attn1.processor = store_processor - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order map_size = None @@ -571,28 +569,25 @@ def get_map_size(module, input, output): nonlocal map_size map_size = output.sample.shape[-2:] - forward_hook = self.unet.mid_block.attentions[ - 0].register_forward_post_hook(get_map_size) + forward_hook = self.unet.mid_block.attentions[0].register_forward_post_hook(get_map_size) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform self-attention guidance with the stored self-attentnion map if do_self_attention_guidance: @@ -603,23 +598,19 @@ def get_map_size(module, input, output): # DDIM-like prediction of x0 pred_x0 = self.pred_x0(latents, noise_pred_uncond, t) # get the stored attention maps - uncond_attn, cond_attn = store_processor.attention_probs.chunk( - 2) + uncond_attn, cond_attn = store_processor.attention_probs.chunk(2) # self-attention-based degrading of latents degraded_latents = self.sag_masking( pred_x0, uncond_attn, map_size, t, - self.pred_epsilon(latents, noise_pred_uncond, t), ) + self.pred_epsilon(latents, noise_pred_uncond, t), + ) uncond_emb, _ = prompt_embeds.chunk(2) # forward and give guidance - degraded_pred = self.unet( - degraded_latents, - t, - encoder_hidden_states=uncond_emb).sample - noise_pred += sag_scale * ( - noise_pred_uncond - degraded_pred) + degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample + noise_pred += sag_scale * (noise_pred_uncond - degraded_pred) else: # DDIM-like prediction of x0 pred_x0 = self.pred_x0(latents, noise_pred, t) @@ -631,22 +622,17 @@ def 
get_map_size(module, input, output): cond_attn, map_size, t, - self.pred_epsilon(latents, noise_pred, t), ) + self.pred_epsilon(latents, noise_pred, t), + ) # forward and give guidance - degraded_pred = self.unet( - degraded_latents, - t, - encoder_hidden_states=prompt_embeds).sample + degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample noise_pred += sag_scale * (noise_pred - degraded_pred) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -656,8 +642,7 @@ def get_map_size(module, input, output): image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL if output_type == "pil": @@ -666,8 +651,7 @@ def get_map_size(module, input, output): if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def sag_masking(self, original_latents, attn_map, map_size, t, eps): # Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf @@ -681,20 +665,20 @@ def sag_masking(self, original_latents, attn_map, map_size, t, eps): attn_map = attn_map.reshape([b, h, hw1, hw2]) attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0 - attn_mask = (attn_mask.reshape([b, map_size[0], map_size[1]]) - .unsqueeze(1).tile([1, latent_channel, 1, 1]) - .cast(attn_map.dtype)) + attn_mask = ( + attn_mask.reshape([b, map_size[0], map_size[1]]) + .unsqueeze(1) + .tile([1, latent_channel, 1, 1]) + .cast(attn_map.dtype) + ) attn_mask = F.interpolate(attn_mask, (latent_h, latent_w)) # Blur according to the self-attention mask - degraded_latents = gaussian_blur_2d( - original_latents, kernel_size=9, sigma=1.0) - degraded_latents = degraded_latents * attn_mask + original_latents * ( - 1 - attn_mask) + degraded_latents = gaussian_blur_2d(original_latents, kernel_size=9, sigma=1.0) + degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask) # Noise it again to match the noise level - degraded_latents = self.scheduler.add_noise( - degraded_latents, noise=eps, timesteps=t) + degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t) return degraded_latents @@ -705,20 +689,18 @@ def pred_x0(self, sample, model_output, timestep): beta_prod_t = 1 - alpha_prod_t if self.scheduler.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) elif self.scheduler.config.prediction_type == "sample": pred_original_sample = model_output elif self.scheduler.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * 
model_output + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output # predict V - model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," - " or `v_prediction`") + " or `v_prediction`" + ) return pred_original_sample @@ -729,15 +711,14 @@ def pred_epsilon(self, sample, model_output, timestep): if self.scheduler.config.prediction_type == "epsilon": pred_eps = model_output elif self.scheduler.config.prediction_type == "sample": - pred_eps = (sample - - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5) + pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5) elif self.scheduler.config.prediction_type == "v_prediction": - pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5 - ) * model_output + pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output else: raise ValueError( f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," - " or `v_prediction`") + " or `v_prediction`" + ) return pred_eps @@ -753,12 +734,9 @@ def gaussian_blur_2d(img, kernel_size, sigma): x_kernel = x_kernel.cast(img.dtype) kernel2d = paddle.matmul(x_kernel[:, None], x_kernel[None, :]) - kernel2d = kernel2d.expand( - [img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]]) + kernel2d = kernel2d.expand([img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]]) - padding = [ - kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2 - ] + padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2] img = F.pad(img, padding, mode="reflect") img = F.conv2d(img, kernel2d, groups=img.shape[-3]) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4a2ca10a74b68..85b0706b3ed80 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -37,8 +37,7 @@ def preprocess(image): if isinstance(image[0], PIL.Image.Image): w, h = image[0].size - w, h = map(lambda x: x - x % 64, - (w, h)) # resize to integer multiple of 64 + w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64 image = [np.array(i.resize((w, h)))[None, :] for i in image] image = np.concatenate(image, axis=0) @@ -78,20 +77,21 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - low_res_scheduler: DDPMScheduler, - scheduler: KarrasDiffusionSchedulers, - max_noise_level: int=350, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + low_res_scheduler: DDPMScheduler, + scheduler: KarrasDiffusionSchedulers, + max_noise_level: int = 350, + ): super().__init__() # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate is_vae_scaling_factor_set_to_0_08333 = ( - hasattr(vae.config, "scaling_factor") and - vae.config.scaling_factor == 0.08333) + hasattr(vae.config, "scaling_factor") and 
vae.config.scaling_factor == 0.08333 + ) if not is_vae_scaling_factor_set_to_0_08333: deprecation_message = ( "The configuration file of the vae does not contain `scaling_factor` or it is set to" @@ -105,7 +105,8 @@ def __init__( "wrong scaling_factor", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) vae.register_to_config(scaling_factor=0.08333) self.register_modules( @@ -114,18 +115,20 @@ def __init__( tokenizer=tokenizer, unet=unet, low_res_scheduler=low_res_scheduler, - scheduler=scheduler, ) + scheduler=scheduler, + ) self.register_to_config(max_noise_level=max_noise_level) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -161,29 +164,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -191,8 +196,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -202,14 +206,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" 
{type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -219,36 +225,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -259,15 +262,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -283,13 +284,13 @@ def decode_latents(self, latents): def check_inputs(self, prompt, image, noise_level, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" ) @@ -312,34 +313,32 @@ def check_inputs(self, prompt, image, noise_level, callback_steps): # check noise level if noise_level > self.config.max_noise_level: - raise ValueError( - f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}" - ) + raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = (batch_size, num_channels_latents, height, width) if latents is None: latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents # scale the initial noise by the standard deviation required by the scheduler @@ -348,25 +347,24 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, List[ - PIL.Image.Image]]=None, - num_inference_steps: int=75, - guidance_scale: float=9.0, - noise_level: int=20, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + noise_level: int = 20, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -472,7 +470,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image image = preprocess(image) @@ -484,13 +483,11 @@ def __call__( # 5. Add noise to image noise_level = paddle.to_tensor([noise_level], dtype="int64") - noise = randn_tensor( - image.shape, generator=generator, dtype=prompt_embeds.dtype) + noise = randn_tensor(image.shape, generator=generator, dtype=prompt_embeds.dtype) image = self.low_res_scheduler.add_noise(image, noise, noise_level) batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = paddle.concat([image] * batch_multiplier * - num_images_per_prompt) + image = paddle.concat([image] * batch_multiplier * num_images_per_prompt) noise_level = paddle.concat([noise_level] * image.shape[0]) # 6. Prepare latent variables @@ -503,7 +500,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. 
Check that sizes of image and latents match num_channels_image = image.shape[1] @@ -513,48 +511,41 @@ def __call__( f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_image`: {num_channels_image} " f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, image.cast(latent_model_input.dtype)], - axis=1) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, image.cast(latent_model_input.dtype)], axis=1) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - class_labels=noise_level, ).sample + class_labels=noise_level, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -569,6 +560,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index f89be55fdda9d..eaa7be8cb0324 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -17,8 +17,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import paddle -from paddlenlp.transformers import (CLIPTextModel, CLIPTextModelWithProjection, - CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, +) from paddlenlp.transformers.clip.modeling import CLIPTextModelOutput from 
...loaders import TextualInversionLoaderMixin @@ -26,6 +29,7 @@ from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, randn_tensor, replace_example_docstring + # from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -103,22 +107,23 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin): vae: AutoencoderKL def __init__( - self, - # prior components - prior_tokenizer: CLIPTokenizer, - prior_text_encoder: CLIPTextModelWithProjection, - prior: PriorTransformer, - prior_scheduler: KarrasDiffusionSchedulers, - # image noising components - image_normalizer: StableUnCLIPImageNormalizer, - image_noising_scheduler: KarrasDiffusionSchedulers, - # regular denoising components - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModelWithProjection, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - # vae - vae: AutoencoderKL, ): + self, + # prior components + prior_tokenizer: CLIPTokenizer, + prior_text_encoder: CLIPTextModelWithProjection, + prior: PriorTransformer, + prior_scheduler: KarrasDiffusionSchedulers, + # image noising components + image_normalizer: StableUnCLIPImageNormalizer, + image_noising_scheduler: KarrasDiffusionSchedulers, + # regular denoising components + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + # vae + vae: AutoencoderKL, + ): super().__init__() self.register_modules( @@ -132,18 +137,20 @@ def __init__( text_encoder=text_encoder, unet=unet, scheduler=scheduler, - vae=vae, ) + vae=vae, + ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder def _encode_prior_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None, - text_attention_mask: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): if text_model_output is None: batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -153,44 +160,42 @@ def _encode_prior_prompt( max_length=self.prior_tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask - untruncated_ids = self.prior_tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.prior_tokenizer.batch_decode( - untruncated_ids[:, 
self.prior_tokenizer.model_max_length - - 1:-1]) + untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" ) - text_input_ids = text_input_ids[:, :self.prior_tokenizer. - model_max_length] + text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] prior_text_encoder_output = self.prior_text_encoder(text_input_ids) prompt_embeds = prior_text_encoder_output.text_embeds - prior_text_encoder_hidden_states = ( - prior_text_encoder_output.last_hidden_state) + prior_text_encoder_hidden_states = prior_text_encoder_output.last_hidden_state else: batch_size = text_model_output[0].shape[0] prompt_embeds, prior_text_encoder_hidden_states = ( text_model_output[0], - text_model_output[1], ) + text_model_output[1], + ) text_mask = text_attention_mask - prompt_embeds = prompt_embeds.repeat_interleave( - num_images_per_prompt, axis=0) - prior_text_encoder_hidden_states = ( - prior_text_encoder_hidden_states.repeat_interleave( - num_images_per_prompt, axis=0)) + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) + prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.repeat_interleave( + num_images_per_prompt, axis=0 + ) text_mask = text_mask.repeat_interleave(num_images_per_prompt, axis=0) @@ -203,46 +208,43 @@ def _encode_prior_prompt( max_length=self.prior_tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder( - uncond_input.input_ids) + negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids) - negative_prompt_embeds = ( - negative_prompt_embeds_prior_text_encoder_output.text_embeds) + negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds uncond_prior_text_encoder_hidden_states = ( - negative_prompt_embeds_prior_text_encoder_output. 
- last_hidden_state) + negative_prompt_embeds_prior_text_encoder_output.last_hidden_state + ) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) seq_len = uncond_prior_text_encoder_hidden_states.shape[1] - uncond_prior_text_encoder_hidden_states = ( - uncond_prior_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1])) - uncond_prior_text_encoder_hidden_states = ( - uncond_prior_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1])) - uncond_text_mask = uncond_text_mask.repeat_interleave( - num_images_per_prompt, axis=0) + uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.tile( + [1, num_images_per_prompt, 1] + ) + uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.reshape( + [batch_size * num_images_per_prompt, seq_len, -1] + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) # done duplicates # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) - prior_text_encoder_hidden_states = paddle.concat([ - uncond_prior_text_encoder_hidden_states, - prior_text_encoder_hidden_states, - ]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + prior_text_encoder_hidden_states = paddle.concat( + [ + uncond_prior_text_encoder_hidden_states, + prior_text_encoder_hidden_states, + ] + ) text_mask = paddle.concat([uncond_text_mask, text_mask]) @@ -250,13 +252,14 @@ def _encode_prior_prompt( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
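Every `_encode_prompt` variant touched by this diff ends with the same classifier-free guidance batching: embeddings are tiled once per generation, the negative (unconditional) batch is concatenated in front of the text batch, and the doubled UNet output is split again later with `chunk(2)`. A minimal Paddle sketch of that pattern follows; the shapes and the guidance scale are illustrative assumptions, not values taken from this diff.

import paddle

batch_size, seq_len, dim = 2, 77, 768                               # assumed shapes
prompt_embeds = paddle.randn([batch_size, seq_len, dim])            # "text" half
negative_prompt_embeds = paddle.zeros([batch_size, seq_len, dim])   # "uncond" half

# one forward pass over a doubled batch instead of two separate passes
cfg_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])  # [2*B, L, D]

# after the UNet call the two halves are recombined (stand-in tensor below)
noise_pred = paddle.randn([2 * batch_size, 4, 64, 64])
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guidance_scale = 7.5
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)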
@@ -296,29 +299,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -326,8 +331,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -337,21 +341,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -359,36 +364,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -408,15 +410,13 @@ def prepare_prior_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.prior_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the prior_scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.prior_scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -428,40 +428,38 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - noise_level, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + noise_level, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( @@ -473,11 +471,8 @@ def check_inputs( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - if prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -488,17 +483,18 @@ def check_inputs( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - if (noise_level < 0 or noise_level >= - self.image_noising_scheduler.config.num_train_timesteps): + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: raise ValueError( f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." ) @@ -509,20 +505,19 @@ def prepare_latents(self, shape, dtype, generator, latents, scheduler): latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents latents = latents * scheduler.init_noise_sigma return latents def noise_image_embeddings( - self, - image_embeds: paddle.Tensor, - noise_level: int, - noise: Optional[paddle.Tensor]=None, - generator: Optional[paddle.Generator]=None, ): + self, + image_embeds: paddle.Tensor, + noise_level: int, + noise: Optional[paddle.Tensor] = None, + generator: Optional[paddle.Generator] = None, + ): """ Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher `noise_level` increases the variance in the final un-noised images. @@ -536,17 +531,13 @@ def noise_image_embeddings( The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. """ if noise is None: - noise = randn_tensor( - image_embeds.shape, - generator=generator, - dtype=image_embeds.dtype) + noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype) noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]) image_embeds = self.image_normalizer.scale(image_embeds) - image_embeds = self.image_noising_scheduler.add_noise( - image_embeds, timesteps=noise_level, noise=noise) + image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) image_embeds = self.image_normalizer.unscale(image_embeds) @@ -554,7 +545,8 @@ def noise_image_embeddings( timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, - downscale_freq_shift=0, ) + downscale_freq_shift=0, + ) # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, # but we might actually be running in fp16. so we need to cast here. 
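`noise_image_embeddings` above implements unCLIP-style noise augmentation: the CLIP image embedding is scaled by the image normalizer, noised to `noise_level` with the image-noising scheduler, un-scaled, and then a sinusoidal embedding of `noise_level` is appended so the UNet can condition on how much noise was applied. The self-contained sketch below captures the idea only; the closed-form noising step, the hand-rolled embedding function, and all shapes are assumptions standing in for `scheduler.add_noise` and `get_timestep_embedding`.

import math

import paddle

def sinusoidal_embedding(timesteps, dim, max_period=10000):
    # transformer-style sin/cos embedding of an integer noise level
    half = dim // 2
    freqs = paddle.exp(-math.log(max_period) * paddle.arange(half, dtype="float32") / half)
    args = timesteps.cast("float32")[:, None] * freqs[None, :]
    return paddle.concat([paddle.cos(args), paddle.sin(args)], axis=-1)

image_embeds = paddle.randn([2, 768])
noise = paddle.randn(image_embeds.shape)
noise_level = paddle.to_tensor([20] * image_embeds.shape[0])

# q(x_t | x_0) = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
alpha_bar_t = paddle.to_tensor(0.98)  # assumed cumulative alpha at this noise level
image_embeds = alpha_bar_t.sqrt() * image_embeds + (1.0 - alpha_bar_t).sqrt() * noise

# append the noise-level embedding so the conditioning carries the noise amount
level_emb = sinusoidal_embedding(noise_level, image_embeds.shape[-1])
image_embeds = paddle.concat([image_embeds, level_emb], axis=-1)  # [2, 1536]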
@@ -568,30 +560,31 @@ def noise_image_embeddings( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - # regular denoising process args - prompt: Optional[Union[str, List[str]]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=20, - guidance_scale: float=10.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=0, - # prior args - prior_num_inference_steps: int=25, - prior_guidance_scale: float=4.0, - prior_latents: Optional[paddle.Tensor]=None, ): + self, + # regular denoising process args + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 20, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + # prior args + prior_num_inference_steps: int = 25, + prior_guidance_scale: float = 4.0, + prior_latents: Optional[paddle.Tensor] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -687,7 +680,8 @@ def __call__( noise_level=noise_level, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -705,13 +699,11 @@ def __call__( prior_do_classifier_free_guidance = prior_guidance_scale > 1.0 # 3. Encode input prompt - ( - prior_prompt_embeds, - prior_text_encoder_hidden_states, - prior_text_mask, ) = self._encode_prior_prompt( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=prior_do_classifier_free_guidance, ) + (prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask,) = self._encode_prior_prompt( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=prior_do_classifier_free_guidance, + ) # 4. Prepare prior timesteps self.prior_scheduler.set_timesteps(prior_num_inference_steps) @@ -724,43 +716,43 @@ def __call__( prior_prompt_embeds.dtype, generator, prior_latents, - self.prior_scheduler, ) + self.prior_scheduler, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs( - generator, eta) + prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta) # 7. 
Prior denoising loop for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([prior_latents] * 2) - if prior_do_classifier_free_guidance else - prior_latents) - latent_model_input = self.prior_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents + ) + latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t) predicted_image_embedding = self.prior( latent_model_input, timestep=t, proj_embedding=prior_prompt_embeds, encoder_hidden_states=prior_text_encoder_hidden_states, - attention_mask=prior_text_mask, ).predicted_image_embedding + attention_mask=prior_text_mask, + ).predicted_image_embedding if prior_do_classifier_free_guidance: ( predicted_image_embedding_uncond, predicted_image_embedding_text, ) = predicted_image_embedding.chunk(2) - predicted_image_embedding = ( - predicted_image_embedding_uncond + prior_guidance_scale * - (predicted_image_embedding_text - - predicted_image_embedding_uncond)) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) prior_latents = self.prior_scheduler.step( predicted_image_embedding, timestep=t, sample=prior_latents, - **prior_extra_step_kwargs, ).prev_sample + **prior_extra_step_kwargs, + ).prev_sample if callback is not None and i % callback_steps == 0: callback(i, t, prior_latents) @@ -783,13 +775,15 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 9. Prepare image embeddings image_embeds = self.noise_image_embeddings( image_embeds=image_embeds, noise_level=noise_level, - generator=generator, ) + generator=generator, + ) if do_classifier_free_guidance: negative_prompt_embeds = paddle.zeros_like(image_embeds) @@ -809,23 +803,23 @@ def __call__( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) latents = self.prepare_latents( shape=shape, dtype=prompt_embeds.dtype, generator=generator, latents=latents, - scheduler=self.scheduler, ) + scheduler=self.scheduler, + ) # 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 13. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( @@ -833,17 +827,16 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=image_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -856,6 +849,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 043b5a310a9de..288dccda66f3b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -18,9 +18,12 @@ import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -60,8 +63,7 @@ """ -class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): """ Pipeline for text-guided image to image generation using stable unCLIP. 
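Both the prior loop and the latent denoising loops in the hunks above share one skeleton: scale the (optionally doubled) input, predict noise, apply classifier-free guidance, and step the scheduler. The runnable sketch below uses toy stand-ins for the scheduler and UNet purely to show that control flow; none of the stand-in classes exist in ppdiffusers.

from types import SimpleNamespace

import paddle

class ToyScheduler:
    # stand-in for a KarrasDiffusionSchedulers scheduler
    init_noise_sigma = 1.0

    def set_timesteps(self, num_inference_steps):
        self.timesteps = list(range(num_inference_steps - 1, -1, -1))

    def scale_model_input(self, sample, t):
        return sample

    def step(self, noise_pred, t, sample):
        # crude update, purely for illustration
        return SimpleNamespace(prev_sample=sample - 0.1 * noise_pred)

def toy_unet(latent_model_input, t, encoder_hidden_states):
    # stand-in for UNet2DConditionModel.__call__
    return SimpleNamespace(sample=paddle.zeros_like(latent_model_input))

scheduler = ToyScheduler()
do_classifier_free_guidance, guidance_scale = True, 7.5
prompt_embeds = paddle.randn([2, 77, 768])            # [uncond; text] halves
latents = paddle.randn([1, 4, 64, 64]) * scheduler.init_noise_sigma

scheduler.set_timesteps(10)
for t in scheduler.timesteps:
    latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
    latent_model_input = scheduler.scale_model_input(latent_model_input, t)
    noise_pred = toy_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
    if do_classifier_free_guidance:
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    latents = scheduler.step(noise_pred, t, latents).prev_sample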
@@ -108,20 +110,21 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, vae: AutoencoderKL def __init__( - self, - # image encoding components - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - # image noising components - image_normalizer: StableUnCLIPImageNormalizer, - image_noising_scheduler: KarrasDiffusionSchedulers, - # regular denoising components - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModel, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - # vae - vae: AutoencoderKL, ): + self, + # image encoding components + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + # image noising components + image_normalizer: StableUnCLIPImageNormalizer, + image_noising_scheduler: KarrasDiffusionSchedulers, + # regular denoising components + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + # vae + vae: AutoencoderKL, + ): super().__init__() self.register_modules( @@ -133,19 +136,21 @@ def __init__( text_encoder=text_encoder, unet=unet, scheduler=scheduler, - vae=vae, ) + vae=vae, + ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
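The `vae_scale_factor` computed in `__init__` above is what maps pixel sizes to latent sizes in the stable unCLIP pipelines, and it is why `check_inputs` insists that `height` and `width` be divisible by 8. A small worked example; the `block_out_channels` list is an assumed, typical Stable Diffusion VAE config rather than a value read from this diff.

block_out_channels = [128, 256, 512, 512]               # assumed VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # -> 8
height, width = 768, 768
latent_shape = (1, 4, height // vae_scale_factor, width // vae_scale_factor)
print(latent_shape)                                     # (1, 4, 96, 96)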
@@ -185,29 +190,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -215,8 +222,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -226,21 +232,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -248,48 +255,46 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def _encode_image( - self, - image, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance, - noise_level, - generator, - image_embeds, ): + self, + image, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance, + noise_level, + generator, + image_embeds, + ): dtype = self.image_encoder.dtype if isinstance(image, PIL.Image.Image): @@ -306,8 +311,7 @@ def _encode_image( if image_embeds is None: if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) image_embeds = self.image_encoder(image).image_embeds @@ -315,7 +319,8 @@ def _encode_image( image_embeds = self.noise_image_embeddings( image_embeds=image_embeds, noise_level=noise_level, - generator=generator, ) + generator=generator, + ) # duplicate image embeddings for each generation per prompt, using mps friendly method image_embeds = image_embeds.unsqueeze(1) @@ -350,42 +355,40 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - noise_level, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - image_embeds=None, ): + self, + prompt, + image, + height, + width, + callback_steps, + noise_level, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + image_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( @@ -397,11 +400,8 @@ def check_inputs( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - if prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -412,17 +412,18 @@ def check_inputs( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - if (noise_level < 0 or noise_level >= - self.image_noising_scheduler.config.num_train_timesteps): + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: raise ValueError( f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." ) @@ -438,28 +439,33 @@ def check_inputs( ) if image is not None: - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -475,11 +481,12 @@ def prepare_latents( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings def noise_image_embeddings( - self, - image_embeds: paddle.Tensor, - noise_level: int, - noise: Optional[paddle.Tensor]=None, - generator: Optional[paddle.Generator]=None, ): + self, + image_embeds: paddle.Tensor, + noise_level: int, + noise: Optional[paddle.Tensor] = None, + generator: Optional[paddle.Generator] = None, + ): """ Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher `noise_level` increases the variance in the final un-noised images. @@ -493,18 +500,12 @@ def noise_image_embeddings( The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. 
""" if noise is None: - noise = randn_tensor( - image_embeds.shape, - generator=generator, - dtype=image_embeds.dtype) - noise_level = paddle.to_tensor([noise_level] * - image_embeds.shape[0]).reshape( - [image_embeds.shape[0]]) + noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype) + noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]).reshape([image_embeds.shape[0]]) image_embeds = self.image_normalizer.scale(image_embeds) - image_embeds = self.image_noising_scheduler.add_noise( - image_embeds, timesteps=noise_level, noise=noise) + image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) image_embeds = self.image_normalizer.unscale(image_embeds) @@ -512,7 +513,8 @@ def noise_image_embeddings( timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, - downscale_freq_shift=0, ) + downscale_freq_shift=0, + ) # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, # but we might actually be running in fp16. so we need to cast here. @@ -525,27 +527,28 @@ def noise_image_embeddings( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=20, - guidance_scale: float=10, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=0, - image_embeds: Optional[paddle.Tensor]=None, ): + self, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 20, + guidance_scale: float = 10, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + image_embeds: Optional[paddle.Tensor] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -641,7 +644,8 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - image_embeds=image_embeds, ) + image_embeds=image_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -665,7 +669,8 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Encoder input image noise_level = paddle.to_tensor(noise_level) @@ -676,7 +681,8 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, noise_level=noise_level, generator=generator, - image_embeds=image_embeds, ) + image_embeds=image_embeds, + ) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -691,17 +697,16 @@ def __call__( width=width, dtype=prompt_embeds.dtype, generator=generator, - latents=latents, ) + latents=latents, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( @@ -709,17 +714,16 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=image_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -732,6 +736,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py index 8fa2d0f3796b1..28920a1c6de42 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py @@ -16,8 +16,11 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig, - CLIPVisionModel) +from paddlenlp.transformers import ( + CLIPPretrainedModel, + CLIPVisionConfig, + CLIPVisionModel, +) from ...utils import logging @@ -27,8 +30,7 @@ def cosine_distance(image_embeds, text_embeds): normalized_image_embeds = F.normalize(image_embeds) normalized_text_embeds = F.normalize(text_embeds) - return paddle.matmul( - normalized_image_embeds, normalized_text_embeds, transpose_y=True) + return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True) class StableDiffusionSafetyChecker(CLIPPretrainedModel): @@ -40,12 +42,11 @@ def __init__(self, config: CLIPVisionConfig): self.clip = CLIPVisionModel(config) self.vision_projection = paddle.create_parameter( (config.hidden_size, config.projection_dim), - dtype=paddle.get_default_dtype(), ) + dtype=paddle.get_default_dtype(), + ) - self.register_buffer("concept_embeds", - paddle.ones([17, config.projection_dim])) - 
self.register_buffer("special_care_embeds", - paddle.ones([3, config.projection_dim])) + self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim])) + self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim])) self.register_buffer("concept_embeds_weights", paddle.ones([17])) self.register_buffer("special_care_embeds_weights", paddle.ones([3])) @@ -56,11 +57,8 @@ def forward(self, clip_input, images): image_embeds = paddle.matmul(pooled_output, self.vision_projection) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - special_cos_dist = ( - cosine_distance(image_embeds, self.special_care_embeds) - .astype("float32").numpy()) - cos_dist = (cosine_distance( - image_embeds, self.concept_embeds).astype("float32").numpy()) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy() + cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy() result = [] batch_size = image_embeds.shape[0] @@ -78,22 +76,16 @@ def forward(self, clip_input, images): for concept_idx in range(len(special_cos_dist[0])): concept_cos = special_cos_dist[i][concept_idx] - concept_threshold = self.special_care_embeds_weights[ - concept_idx].item() - result_img["special_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.special_care_embeds_weights[concept_idx].item() + result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if result_img["special_scores"][concept_idx] > 0: - result_img["special_care"].append({ - concept_idx, result_img["special_scores"][concept_idx] - }) + result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) adjustment = 0.01 for concept_idx in range(len(cos_dist[0])): concept_cos = cos_dist[i][concept_idx] - concept_threshold = self.concept_embeds_weights[ - concept_idx].item() - result_img["concept_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.concept_embeds_weights[concept_idx].item() + result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if result_img["concept_scores"][concept_idx] > 0: result_img["bad_concepts"].append(concept_idx) @@ -111,34 +103,29 @@ def forward(self, clip_input, images): if any(has_nsfw_concepts): logger.warning( "Potential NSFW content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed.") + " Try again with a different prompt and/or seed." 
+ ) return images, has_nsfw_concepts - def forward_fastdeploy(self, - clip_input: paddle.Tensor, - images: paddle.Tensor): + def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor): pooled_output = self.clip(clip_input)[1] # pooled_output image_embeds = paddle.matmul(pooled_output, self.vision_projection) - special_cos_dist = cosine_distance(image_embeds, - self.special_care_embeds) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) cos_dist = cosine_distance(image_embeds, self.concept_embeds) # increase this value to create a stronger `nsfw` filter # at the cost of increasing the possibility of filtering benign images adjustment = 0.0 - special_scores = ( - special_cos_dist - self.special_care_embeds_weights + adjustment) + special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment # special_scores = special_scores.round(decimals=3) special_care = paddle.any(special_scores > 0, axis=1) special_adjustment = special_care * 0.01 - special_adjustment = special_adjustment.unsqueeze(1).expand( - [-1, cos_dist.shape[1]]) + special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]]) - concept_scores = (cos_dist - self.concept_embeds_weights - ) + special_adjustment + concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment # concept_scores = concept_scores.round(decimals=3) has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py index dd502e817aac3..8792792dd7fc4 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py @@ -32,34 +32,38 @@ class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - embedding_dim: int=768, ): + self, + embedding_dim: int = 768, + ): super().__init__() self.mean = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) self.std = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(1.0), ) + default_initializer=nn.initializer.Constant(1.0), + ) def to( - self, - device: Optional[str]=None, - dtype: Optional[paddle.dtype]=None, ): + self, + device: Optional[str] = None, + dtype: Optional[paddle.dtype] = None, + ): if dtype is not None: self.mean = self.create_parameter( self.mean.shape, dtype=dtype, - default_initializer=paddle.nn.initializer.Assign( - self.mean.numpy()), ) + default_initializer=paddle.nn.initializer.Assign(self.mean.numpy()), + ) self.std = self.create_parameter( self.std.shape, dtype=dtype, - default_initializer=paddle.nn.initializer.Assign(self.std.numpy( - )), ) + default_initializer=paddle.nn.initializer.Assign(self.std.numpy()), + ) if device is not None: self.mean._to(device) self.std._to(device) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 82b88765d936c..b2c27a601a306 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ 
b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -20,8 +20,7 @@ import numpy as np import paddle from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -67,41 +66,38 @@ class StableDiffusionPipelineSafe(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: SafeStableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: SafeStableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() safety_concept: Optional[str] = ( "an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity," " bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child" - " abuse, brutality, cruelty") + " abuse, brutality, cruelty" + ) - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -109,11 +105,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -134,12 +126,10 @@ def __init__( " checker. 
If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -150,12 +140,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -167,9 +154,10 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self._safety_text_concept = safety_concept - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) @property @@ -194,12 +182,13 @@ def safety_concept(self, concept): self._safety_text_concept = concept def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - enable_safety_guidance, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + enable_safety_guidance, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -221,35 +210,35 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -259,14 +248,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -276,25 +267,24 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # Encode the safety concept text if enable_safety_guidance: @@ -303,40 +293,35 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - safety_embeddings = self.text_encoder( - safety_concept_input.input_ids)[0] + return_tensors="pd", + ) + safety_embeddings = self.text_encoder(safety_concept_input.input_ids)[0] # duplicate safety embeddings for each generation per prompt, using mps friendly method seq_len = safety_embeddings.shape[1] - safety_embeddings = safety_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - safety_embeddings = safety_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + safety_embeddings = safety_embeddings.tile([batch_size, num_images_per_prompt, 1]) + safety_embeddings = safety_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance + sld, we need to do three forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing three forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds, safety_embeddings]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds, safety_embeddings]) else: # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype, enable_safety_guidance): if self.safety_checker is not None: images = image.copy() - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) flagged_images = np.zeros((2, *image.shape[1:])) if any(has_nsfw_concept): logger.warning( @@ -369,54 +354,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -429,23 +410,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -460,49 +444,48 @@ def prepare_latents( return latents def perform_safety_guidance( - self, - enable_safety_guidance, - safety_momentum, - noise_guidance, - noise_pred_out, - i, - sld_guidance_scale, - sld_warmup_steps, - sld_threshold, - sld_momentum_scale, - sld_mom_beta, ): + self, + enable_safety_guidance, + safety_momentum, + noise_guidance, + noise_pred_out, + i, + sld_guidance_scale, + sld_warmup_steps, + sld_threshold, + sld_momentum_scale, + sld_mom_beta, + ): # Perform SLD guidance if enable_safety_guidance: if safety_momentum is None: safety_momentum = paddle.zeros_like(noise_guidance) - noise_pred_text, noise_pred_uncond = noise_pred_out[ - 0], noise_pred_out[1] + noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1] noise_pred_safety_concept = noise_pred_out[2] # Equation 6 scale = paddle.clip( - paddle.abs((noise_pred_text - noise_pred_safety_concept)) * - sld_guidance_scale, - max=1.0, ) + paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, + max=1.0, + ) # Equation 6 safety_concept_scale = paddle.where( (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, paddle.zeros_like(scale), - scale, ) + scale, + ) # Equation 4 noise_guidance_safety = paddle.multiply( - (noise_pred_safety_concept - noise_pred_uncond), - safety_concept_scale) + (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale + ) # Equation 7 - noise_guidance_safety = ( - noise_guidance_safety + sld_momentum_scale * safety_momentum) + noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum # Equation 8 - safety_momentum = (sld_mom_beta * safety_momentum + - (1 - sld_mom_beta) * noise_guidance_safety) + safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety if i >= sld_warmup_steps: # Warmup # Equation 3 @@ -511,27 +494,27 @@ def perform_safety_guidance( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: 
float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - sld_guidance_scale: Optional[float]=1000, - sld_warmup_steps: Optional[int]=10, - sld_threshold: Optional[float]=0.01, - sld_momentum_scale: Optional[float]=0.3, - sld_mom_beta: Optional[float]=0.4, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + sld_guidance_scale: Optional[float] = 1000, + sld_warmup_steps: Optional[int] = 10, + sld_threshold: Optional[float] = 0.01, + sld_momentum_scale: Optional[float] = 0.3, + sld_mom_beta: Optional[float] = 0.4, + ): r""" Function invoked when calling the pipeline for generation. @@ -620,8 +603,7 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - enable_safety_guidance = (sld_guidance_scale > 1.0 and - do_classifier_free_guidance) + enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance if not enable_safety_guidance: warnings.warn("Safety checker disabled!") @@ -631,7 +613,8 @@ def __call__( num_images_per_prompt, do_classifier_free_guidance, negative_prompt, - enable_safety_guidance, ) + enable_safety_guidance, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -646,36 +629,35 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) safety_momentum = None - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat( - [latents] * (3 if enable_safety_guidance else 2)) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([latents] * (3 if enable_safety_guidance else 2)) + if do_classifier_free_guidance + else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: - noise_pred_out = noise_pred.chunk( - (3 if enable_safety_guidance else 2)) + noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2)) noise_pred_uncond, noise_pred_text = ( noise_pred_out[0], - noise_pred_out[1], ) + noise_pred_out[1], + ) # default classifier free guidance noise_guidance = noise_pred_text - noise_pred_uncond @@ -688,32 +670,28 @@ def __call__( # Equation 6 scale = paddle.clip( - paddle.abs( - (noise_pred_text - noise_pred_safety_concept)) * - sld_guidance_scale, - max=1.0, ) + paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, + max=1.0, + ) # Equation 6 safety_concept_scale = paddle.where( - (noise_pred_text - noise_pred_safety_concept) >= - sld_threshold, + (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, paddle.zeros_like(scale), - scale, ) + scale, + ) # Equation 4 noise_guidance_safety = paddle.multiply( (noise_pred_safety_concept - noise_pred_uncond), - safety_concept_scale, ) + safety_concept_scale, + ) # Equation 7 - noise_guidance_safety = ( - noise_guidance_safety + sld_momentum_scale * - safety_momentum) + noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum # Equation 8 - safety_momentum = ( - sld_mom_beta * safety_momentum + - (1 - sld_mom_beta) * noise_guidance_safety) + safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety if i >= sld_warmup_steps: # Warmup # Equation 3 @@ -722,13 +700,10 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * noise_guidance # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -738,7 +713,8 @@ def __call__( # 9. Run safety checker image, has_nsfw_concept, flagged_images = self.run_safety_checker( - image, prompt_embeds.dtype, enable_safety_guidance) + image, prompt_embeds.dtype, enable_safety_guidance + ) # 10. 
Convert to PIL if output_type == "pil": @@ -751,11 +727,12 @@ def __call__( image, has_nsfw_concept, self._safety_text_concept if enable_safety_guidance else None, - flagged_images, ) + flagged_images, + ) return StableDiffusionSafePipelineOutput( images=image, nsfw_content_detected=has_nsfw_concept, - applied_safety_concept=self._safety_text_concept - if enable_safety_guidance else None, - unsafe_images=flagged_images, ) + applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None, + unsafe_images=flagged_images, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py index ceae2727162f5..43772eac7c2cb 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py @@ -15,8 +15,11 @@ import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig, - CLIPVisionModel) +from paddlenlp.transformers import ( + CLIPPretrainedModel, + CLIPVisionConfig, + CLIPVisionModel, +) from ...utils import logging @@ -26,8 +29,7 @@ def cosine_distance(image_embeds, text_embeds): normalized_image_embeds = F.normalize(image_embeds) normalized_text_embeds = F.normalize(text_embeds) - return paddle.matmul( - normalized_image_embeds, normalized_text_embeds, transpose_y=True) + return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True) class SafeStableDiffusionSafetyChecker(CLIPPretrainedModel): @@ -39,12 +41,11 @@ def __init__(self, config: CLIPVisionConfig): self.vision_projection = paddle.create_parameter( (config.hidden_size, config.projection_dim), - dtype=paddle.get_default_dtype(), ) + dtype=paddle.get_default_dtype(), + ) - self.register_buffer("concept_embeds", - paddle.ones([17, config.projection_dim])) - self.register_buffer("special_care_embeds", - paddle.ones([3, config.projection_dim])) + self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim])) + self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim])) self.register_buffer("concept_embeds_weights", paddle.ones([17])) self.register_buffer("special_care_embeds_weights", paddle.ones([3])) @@ -55,11 +56,8 @@ def forward(self, clip_input, images): image_embeds = paddle.matmul(pooled_output, self.vision_projection) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - special_cos_dist = ( - cosine_distance(image_embeds, self.special_care_embeds) - .astype("float32").numpy()) - cos_dist = (cosine_distance( - image_embeds, self.concept_embeds).astype("float32").numpy()) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy() + cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy() result = [] batch_size = image_embeds.shape[0] @@ -77,22 +75,16 @@ def forward(self, clip_input, images): for concept_idx in range(len(special_cos_dist[0])): concept_cos = special_cos_dist[i][concept_idx] - concept_threshold = self.special_care_embeds_weights[ - concept_idx].item() - result_img["special_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.special_care_embeds_weights[concept_idx].item() + result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if 
result_img["special_scores"][concept_idx] > 0: - result_img["special_care"].append({ - concept_idx, result_img["special_scores"][concept_idx] - }) + result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) adjustment = 0.01 for concept_idx in range(len(cos_dist[0])): concept_cos = cos_dist[i][concept_idx] - concept_threshold = self.concept_embeds_weights[ - concept_idx].item() - result_img["concept_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.concept_embeds_weights[concept_idx].item() + result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if result_img["concept_scores"][concept_idx] > 0: result_img["bad_concepts"].append(concept_idx) @@ -102,30 +94,24 @@ def forward(self, clip_input, images): return images, has_nsfw_concepts - def forward_fastdeploy(self, - clip_input: paddle.Tensor, - images: paddle.Tensor): + def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor): pooled_output = self.clip(clip_input)[1] # pooled_output image_embeds = paddle.matmul(pooled_output, self.vision_projection) - special_cos_dist = cosine_distance(image_embeds, - self.special_care_embeds) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) cos_dist = cosine_distance(image_embeds, self.concept_embeds) # increase this value to create a stronger `nsfw` filter # at the cost of increasing the possibility of filtering benign images adjustment = 0.0 - special_scores = ( - special_cos_dist - self.special_care_embeds_weights + adjustment) + special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment # special_scores = special_scores.round(decimals=3) special_care = paddle.any(special_scores > 0, axis=1) special_adjustment = special_care * 0.01 - special_adjustment = special_adjustment.unsqueeze(1).expand( - [-1, cos_dist.shape[1]]) + special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]]) - concept_scores = (cos_dist - self.concept_embeds_weights - ) + special_adjustment + concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment # concept_scores = concept_scores.round(decimals=3) has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1) diff --git a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py index acd0aad93d9ee..d06ace3696225 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +++ b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py @@ -48,14 +48,14 @@ def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=50, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[Tuple, ImagePipelineOutput]: + self, + batch_size: int = 1, + num_inference_steps: int = 50, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -82,8 +82,7 @@ def __call__( model = self.unet # sample x_0 ~ N(0, sigma_0^2 * I) - sample = (randn_tensor( - shape, 
generator=generator) * self.scheduler.init_noise_sigma) + sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma self.scheduler.set_timesteps(num_inference_steps) @@ -94,31 +93,28 @@ def __call__( # 1. Select temporarily increased noise level sigma_hat # 2. Add new noise to move from sample_i to sample_hat - sample_hat, sigma_hat = self.scheduler.add_noise_to_input( - sample, sigma, generator=generator) + sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator) # 3. Predict the noise residual given the noise magnitude `sigma_hat` # The model inputs and output are adjusted by following eq. (213) in [1]. - model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, - sigma_hat / 2).sample + model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample # 4. Evaluate dx/dt at sigma_hat # 5. Take Euler step from sigma to sigma_prev - step_output = self.scheduler.step(model_output, sigma_hat, - sigma_prev, sample_hat) + step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat) if sigma_prev != 0: # 6. Apply 2nd order correction # The model inputs and output are adjusted by following eq. (213) in [1]. - model_output = (sigma_prev / 2) * model( - (step_output.prev_sample + 1) / 2, sigma_prev / 2).sample + model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample step_output = self.scheduler.step_correct( model_output, sigma_hat, sigma_prev, sample_hat, step_output.prev_sample, - step_output["derivative"], ) + step_output["derivative"], + ) sample = step_output.prev_sample sample = (sample / 2 + 0.5).clip(0, 1) @@ -127,6 +123,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py index 2ab0f9892a8b6..649c39a7ecdad 100644 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py @@ -19,8 +19,12 @@ import numpy as np import paddle -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_paddle_available, is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) @dataclass diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index de047ee797c85..8ecc3b2759f33 100644 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -47,8 +47,7 @@ """ -def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: +def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: # This code is copied from https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 # reshape to ncfhw mean = paddle.to_tensor(mean).reshape((1, -1, 1, 1, 1)) @@ -85,29 +84,32 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin): """ def __init__( - self, - vae: 
AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet3DConditionModel, - scheduler: KarrasDiffusionSchedulers, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): """ Encodes the prompt into text encoder hidden states. @@ -145,32 +147,30 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None - prompt_embeds = self.text_encoder( - text_input_ids, attention_mask=attention_mask) + prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - (bs_embed * num_images_per_prompt, seq_len, -1)) + prompt_embeds = prompt_embeds.reshape((bs_embed * num_images_per_prompt, seq_len, -1)) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] @@ -191,48 +191,41 @@ def _encode_prompt( # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = 
self.tokenizer( uncond_tokens, padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + return_tensors="pd", + ) + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - (batch_size * num_images_per_prompt, seq_len, -1)) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1)) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents batch_size, channels, num_frames, height, width = latents.shape - latents = latents.transpose([0, 2, 1, 3, 4]).reshape( - (batch_size * num_frames, channels, height, width)) + latents = latents.transpose([0, 2, 1, 3, 4]).reshape((batch_size * num_frames, channels, height, width)) image = self.vae.decode(latents).sample - video = (image[None, :] - .reshape((batch_size, num_frames, -1) + tuple(image.shape[2:])) - .transpose([0, 2, 1, 3, 4])) + video = ( + image[None, :].reshape((batch_size, num_frames, -1) + tuple(image.shape[2:])).transpose([0, 2, 1, 3, 4]) + ) video = video.cast("float32") return video @@ -241,33 +234,33 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) @@ -279,11 +272,8 @@ def check_inputs( raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." @@ -295,21 +285,23 @@ def check_inputs( ) def prepare_latents( - self, - batch_size, - num_channels_latents, - num_frames, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, num_frames, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
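The reformatted `prepare_latents` above builds a 5-D video latent of shape (batch, channels, frames, height // vae_scale_factor, width // vae_scale_factor), and `decode_latents` earlier in this file folds the frame axis into the batch axis so the 2-D VAE can decode frame by frame. A minimal sketch of that shape arithmetic, assuming purely illustrative sizes (none of the concrete values below come from the diff):

import paddle

# Illustrative sizes; the pipeline derives these from its inputs and the VAE config.
batch_size, num_channels_latents, num_frames = 1, 4, 16
height, width, vae_scale_factor = 256, 256, 8

shape = (
    batch_size,
    num_channels_latents,
    num_frames,
    height // vae_scale_factor,
    width // vae_scale_factor,
)
latents = paddle.randn(list(shape))  # stand-in for randn_tensor(shape, generator=..., dtype=...)

# decode_latents moves frames into the batch axis before calling the 2-D VAE decoder.
b, c, f, h, w = latents.shape
frames_as_batch = latents.transpose([0, 2, 1, 3, 4]).reshape((b * f, c, h, w))
assert list(frames_as_batch.shape) == [b * f, c, h, w]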
@@ -323,25 +315,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_frames: int=16, - num_inference_steps: int=50, - guidance_scale: float=9.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="np", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -423,7 +415,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -442,7 +435,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -459,48 +453,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # reshape latents bsz, channel, frames, width, height = latents.shape - latents = latents.transpose([0, 2, 1, 3, 4]).reshape( - (bsz * frames, channel, width, height)) - noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape( - (bsz * frames, channel, width, height)) + latents = latents.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height)) + noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height)) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample - latents = (latents[None, :].reshape( - (bsz, frames, channel, width, height)) - .transpose([0, 2, 1, 3, 4])) + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = latents[None, :].reshape((bsz, frames, channel, width, height)).transpose([0, 2, 1, 3, 4]) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -510,5 +494,5 @@ def __call__( else: video = tensor2vid(video_tensor) if not return_dict: - return (video, ) + return (video,) return TextToVideoSDPipelineOutput(frames=video) diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 5f9ccbe235000..106382dceb106 100644 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -20,31 +20,26 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, 
StableDiffusionSafetyChecker) + StableDiffusionPipeline, + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers from ppdiffusers.utils import BaseOutput def rearrange_0(tensor, f): F, C, H, W = tensor.shape - tensor = paddle.transpose( - x=paddle.reshape( - x=tensor, shape=(F // f, f, C, H, W)), - perm=(0, 2, 1, 3, 4)) + tensor = paddle.transpose(x=paddle.reshape(x=tensor, shape=(F // f, f, C, H, W)), perm=(0, 2, 1, 3, 4)) return tensor def rearrange_1(tensor): B, C, F, H, W = tensor.shape - return paddle.reshape( - x=paddle.transpose( - x=tensor, perm=(0, 2, 1, 3, 4)), - shape=(B * F, C, H, W)) + return paddle.reshape(x=paddle.transpose(x=tensor, perm=(0, 2, 1, 3, 4)), shape=(B * F, C, H, W)) def rearrange_3(tensor, f): @@ -70,21 +65,15 @@ class CrossFrameAttnProcessor: def __init__(self, batch_size=2): self.batch_size = batch_size - def __call__(self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None): + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) is_cross_attention = encoder_hidden_states is not None if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -144,10 +133,10 @@ def warp_single_latent(latent, reference_flow): if isinstance(latent.dtype, paddle.dtype): dtype = latent.dtype elif isinstance(latent.dtype, str) and latent.dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = latent.dtype elif isinstance(latent.dtype, paddle.Tensor): @@ -161,13 +150,11 @@ def warp_single_latent(latent, reference_flow): coords_t0 = coords_t0 * 2.0 - 1.0 coords_t0 = F.interpolate(x=coords_t0, size=(h, w), mode="bilinear") coords_t0 = paddle.transpose(x=coords_t0, perm=(0, 2, 3, 1)) - warped = F.grid_sample( - x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection") + warped = F.grid_sample(x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection") return warped -def create_motion_field(motion_field_strength_x, motion_field_strength_y, - frame_ids, dtype): +def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, dtype): """ Create translation motion field @@ -184,15 +171,12 @@ def create_motion_field(motion_field_strength_x, motion_field_strength_y, seq_length = len(frame_ids) reference_flow = paddle.zeros(shape=(seq_length, 2, 512, 512), dtype=dtype) for fr_idx in range(seq_length): - reference_flow[(fr_idx), (0), :, :] = (motion_field_strength_x * - frame_ids[fr_idx]) - reference_flow[(fr_idx), (1), :, :] = (motion_field_strength_y * - frame_ids[fr_idx]) + reference_flow[(fr_idx), (0), :, :] = motion_field_strength_x * frame_ids[fr_idx] + reference_flow[(fr_idx), (1), :, :] = motion_field_strength_y * frame_ids[fr_idx] return reference_flow -def create_motion_field_and_warp_latents( - motion_field_strength_x, motion_field_strength_y, frame_ids, latents): +def create_motion_field_and_warp_latents(motion_field_strength_x, 
motion_field_strength_y, frame_ids, latents): """ Creates translation motion and warps the latents accordingly @@ -210,11 +194,11 @@ def create_motion_field_and_warp_latents( motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, frame_ids=frame_ids, - dtype=latents.dtype, ) + dtype=latents.dtype, + ) warped_latents = latents.clone().detach() for i in range(len(warped_latents)): - warped_latents[i] = warp_single_latent(latents[i][None], - motion_field[i][None]) + warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None]) return warped_latents @@ -244,15 +228,16 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__( vae, text_encoder, @@ -261,7 +246,8 @@ def __init__( scheduler, safety_checker, feature_extractor, - requires_safety_checker, ) + requires_safety_checker, + ) self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) def forward_loop(self, x_t0, t0, t1, generator): @@ -277,24 +263,23 @@ def forward_loop(self, x_t0, t0, t1, generator): Returns: x_t1: forward process applied to x_t0 from time t0 to t1. """ - eps = paddle.randn( - shape=x_t0.shape, generator=generator, dtype=x_t0.dtype) + eps = paddle.randn(shape=x_t0.shape, generator=generator, dtype=x_t0.dtype) alpha_vec = paddle.prod(x=self.scheduler.alphas[t0:t1]) - x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 - - alpha_vec) * eps + x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 - alpha_vec) * eps return x_t1 def backward_loop( - self, - latents, - timesteps, - prompt_embeds, - guidance_scale, - callback, - callback_steps, - num_warmup_steps, - extra_step_kwargs, - cross_attention_kwargs=None, ): + self, + latents, + timesteps, + prompt_embeds, + guidance_scale, + callback, + callback_steps, + num_warmup_steps, + extra_step_kwargs, + cross_attention_kwargs=None, + ): """ Perform backward process given list of time steps @@ -326,32 +311,27 @@ def backward_loop( with self.progress_bar(total=num_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, 
noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -359,27 +339,27 @@ def backward_loop( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - video_length: Optional[int]=8, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_videos_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - motion_field_strength_x: float=12, - motion_field_strength_y: float=12, - output_type: Optional[str]="tensor", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - t0: int=44, - t1: int=47, ): + self, + prompt: Union[str, List[str]], + video_length: Optional[int] = 8, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + motion_field_strength_x: float = 12, + motion_field_strength_y: float = 12, + output_type: Optional[str] = "tensor", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + t0: int = 44, + t1: int = 47, + ): """ Function invoked when calling the pipeline for generation. @@ -471,12 +451,14 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # Encode input prompt - prompt_embeds = self._encode_prompt(prompt, num_videos_per_prompt, - do_classifier_free_guidance, - negative_prompt) + prompt_embeds = self._encode_prompt( + prompt, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt + ) # Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, ) + self.scheduler.set_timesteps( + num_inference_steps, + ) timesteps = self.scheduler.timesteps # Prepare latent variables @@ -488,35 +470,37 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order # Perform the first backward process up to time T_1 x_1_t1 = self.backward_loop( - timesteps=timesteps[:-t1 - 1], + timesteps=timesteps[: -t1 - 1], prompt_embeds=prompt_embeds, latents=latents, guidance_scale=guidance_scale, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=num_warmup_steps, ) + num_warmup_steps=num_warmup_steps, + ) scheduler_copy = copy.deepcopy(self.scheduler) # Perform the second backward process up to time T_0 x_1_t0 = self.backward_loop( - timesteps=timesteps[-t1 - 1:-t0 - 1], + timesteps=timesteps[-t1 - 1 : -t0 - 1], prompt_embeds=prompt_embeds, latents=x_1_t1, guidance_scale=guidance_scale, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=0, ) + num_warmup_steps=0, + ) # Propagate first frame latents at time T_0 to remaining frames x_2k_t0 = x_1_t0.tile(repeat_times=[video_length - 1, 1, 1, 1]) @@ -526,31 +510,34 @@ def __call__( motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, latents=x_2k_t0, - frame_ids=frame_ids[1:], ) + frame_ids=frame_ids[1:], + ) # Perform forward process up to time T_1 x_2k_t1 = self.forward_loop( x_t0=x_2k_t0, t0=timesteps[-t0 - 1].item(), t1=timesteps[-t1 - 1].item(), - generator=generator, ) + generator=generator, + ) # Perform backward process from time T_1 to 0 x_1k_t1 = paddle.concat(x=[x_1_t1, x_2k_t1]) b, l, d = prompt_embeds.shape - prompt_embeds = (prompt_embeds[:, (None)] - .tile(repeat_times=[1, video_length, 1, 1]) - .reshape([b * video_length, l, d])) + prompt_embeds = ( + prompt_embeds[:, (None)].tile(repeat_times=[1, video_length, 1, 1]).reshape([b * video_length, l, d]) + ) self.scheduler = scheduler_copy x_1k_0 = self.backward_loop( - timesteps=timesteps[-t1 - 1:], + timesteps=timesteps[-t1 - 1 :], prompt_embeds=prompt_embeds, latents=x_1k_t1, guidance_scale=guidance_scale, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=0, ) + num_warmup_steps=0, + ) latents = x_1k_0 paddle.device.cuda.empty_cache() if output_type == "latent": @@ -558,9 +545,7 @@ def __call__( has_nsfw_concept = None else: image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return image, has_nsfw_concept - return TextToVideoPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py index 90e39132e944b..4fa798729384f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py @@ -12,15 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) try: if not (is_paddlenlp_available() and is_paddle_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import ( - UnCLIPImageVariationPipeline, UnCLIPPipeline) + UnCLIPImageVariationPipeline, + UnCLIPPipeline, + ) else: from .pipeline_unclip import UnCLIPPipeline from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py index 4c591a6c434cb..9f9d905244ac2 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py @@ -75,17 +75,18 @@ class UnCLIPPipeline(DiffusionPipeline): super_res_scheduler: UnCLIPScheduler def __init__( - self, - prior: PriorTransformer, - decoder: UNet2DConditionModel, - text_encoder: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - text_proj: UnCLIPTextProjModel, - super_res_first: UNet2DModel, - super_res_last: UNet2DModel, - prior_scheduler: UnCLIPScheduler, - decoder_scheduler: UnCLIPScheduler, - super_res_scheduler: UnCLIPScheduler, ): + self, + prior: PriorTransformer, + decoder: UNet2DConditionModel, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_proj: UnCLIPTextProjModel, + super_res_first: UNet2DModel, + super_res_last: UNet2DModel, + prior_scheduler: UnCLIPScheduler, + decoder_scheduler: UnCLIPScheduler, + super_res_scheduler: UnCLIPScheduler, + ): super().__init__() self.register_modules( @@ -98,27 +99,27 @@ def __init__( super_res_last=super_res_last, prior_scheduler=prior_scheduler, decoder_scheduler=decoder_scheduler, - super_res_scheduler=super_res_scheduler, ) + super_res_scheduler=super_res_scheduler, + ) def prepare_latents(self, shape, dtype, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents * scheduler.init_noise_sigma return latents def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None, - text_attention_mask: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): if text_model_output is None: batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -128,23 +129,24 @@ def _encode_prompt( max_length=self.tokenizer.model_max_length, truncation=True, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not 
paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, : - self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] text_encoder_output = self.text_encoder(text_input_ids) @@ -155,27 +157,26 @@ def _encode_prompt( batch_size = text_model_output[0].shape[0] prompt_embeds, text_encoder_hidden_states = ( text_model_output[0], - text_model_output[1], ) + text_model_output[1], + ) text_mask = text_attention_mask # duplicate text embeddings for each generation per prompt seq_len = prompt_embeds.shape[1] prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt]) - prompt_embeds = prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) # duplicate text_encoder_hidden_states for each generation per prompt seq_len = text_encoder_hidden_states.shape[1] - text_encoder_hidden_states = text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) + text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) text_encoder_hidden_states = text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate text_mask for each generation per prompt seq_len = text_mask.shape[1] text_mask = text_mask.tile([1, num_images_per_prompt]) - text_mask = text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) # text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0) @@ -190,47 +191,38 @@ def _encode_prompt( max_length=self.tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_text_encoder_output = self.text_encoder( - uncond_input.input_ids) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids) - negative_prompt_embeds = ( - negative_prompt_embeds_text_encoder_output.text_embeds) - uncond_text_encoder_hidden_states = ( - negative_prompt_embeds_text_encoder_output.last_hidden_state) + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) + negative_prompt_embeds = 
negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) - uncond_text_encoder_hidden_states = ( - uncond_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1])) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape( + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate uncond_text_mask for each generation per prompt seq_len = uncond_text_mask.shape[1] uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt]) - uncond_text_mask = uncond_text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) # done duplicates # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = paddle.concat([ - uncond_text_encoder_hidden_states, text_encoder_hidden_states - ]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) text_mask = paddle.concat([uncond_text_mask, text_mask]) @@ -238,23 +230,23 @@ def _encode_prompt( @paddle.no_grad() def __call__( - self, - prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: int=1, - prior_num_inference_steps: int=25, - decoder_num_inference_steps: int=25, - super_res_num_inference_steps: int=7, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prior_latents: Optional[paddle.Tensor]=None, - decoder_latents: Optional[paddle.Tensor]=None, - super_res_latents: Optional[paddle.Tensor]=None, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None, - text_attention_mask: Optional[paddle.Tensor]=None, - prior_guidance_scale: float=4.0, - decoder_guidance_scale: float=8.0, - output_type: Optional[str]="pil", - return_dict: bool=True, ): + self, + prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + prior_num_inference_steps: int = 25, + decoder_num_inference_steps: int = 25, + super_res_num_inference_steps: int = 7, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prior_latents: Optional[paddle.Tensor] = None, + decoder_latents: Optional[paddle.Tensor] = None, + super_res_latents: Optional[paddle.Tensor] = None, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + prior_guidance_scale: float = 4.0, + decoder_guidance_scale: float = 8.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): """ Function invoked when calling the pipeline for generation. 
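Both the prior loop and the decoder loop in the hunks below apply the same classifier-free-guidance update: the batched prediction is chunked into an unconditional half and a text-conditioned half, and the guided estimate is uncond + scale * (text - uncond), with `prior_guidance_scale` defaulting to 4.0 and `decoder_guidance_scale` to 8.0. A minimal sketch of that step, using illustrative tensor sizes rather than the pipeline's real shapes:

import paddle

guidance_scale = 4.0  # e.g. prior_guidance_scale; the decoder pass uses decoder_guidance_scale (8.0)
# Stacked [unconditional, conditional] predictions, as produced from the doubled latent batch.
predicted = paddle.randn([2, 768])
pred_uncond, pred_text = predicted.chunk(chunks=2)
guided = pred_uncond + guidance_scale * (pred_text - pred_uncond)
assert guided.shape == pred_uncond.shape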
@@ -312,23 +304,21 @@ def __call__( elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") else: batch_size = text_model_output[0].shape[0] batch_size = batch_size * num_images_per_prompt - do_classifier_free_guidance = (prior_guidance_scale > 1.0 or - decoder_guidance_scale > 1.0) + do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, text_model_output, - text_attention_mask, ) + text_attention_mask, + ) # prior @@ -342,30 +332,29 @@ def __call__( prompt_embeds.dtype, generator, prior_latents, - self.prior_scheduler, ) + self.prior_scheduler, + ) for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([prior_latents] * 2) - if do_classifier_free_guidance else - prior_latents) + latent_model_input = paddle.concat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents predicted_image_embedding = self.prior( latent_model_input, timestep=t, proj_embedding=prompt_embeds, encoder_hidden_states=text_encoder_hidden_states, - attention_mask=text_mask, ).predicted_image_embedding + attention_mask=text_mask, + ).predicted_image_embedding if do_classifier_free_guidance: ( predicted_image_embedding_uncond, predicted_image_embedding_text, ) = predicted_image_embedding.chunk(2) - predicted_image_embedding = ( - predicted_image_embedding_uncond + prior_guidance_scale * - (predicted_image_embedding_text - - predicted_image_embedding_uncond)) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) if i + 1 == prior_timesteps_tensor.shape[0]: prev_timestep = None @@ -377,7 +366,8 @@ def __call__( timestep=t, sample=prior_latents, generator=generator, - prev_timestep=prev_timestep, ).prev_sample + prev_timestep=prev_timestep, + ).prev_sample prior_latents = self.prior.post_process_latents(prior_latents) @@ -390,13 +380,15 @@ def __call__( image_embeddings=image_embeddings, prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) decoder_text_mask = F.pad( text_mask.unsqueeze(0), (self.text_proj.clip_extra_context_tokens, 0), value=1, - data_format="NCL", ).squeeze(0) + data_format="NCL", + ).squeeze(0) self.decoder_scheduler.set_timesteps(decoder_num_inference_steps) decoder_timesteps_tensor = self.decoder_scheduler.timesteps @@ -410,20 +402,22 @@ def __call__( text_encoder_hidden_states.dtype, generator, decoder_latents, - self.decoder_scheduler, ) + self.decoder_scheduler, + ) for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([decoder_latents] * 2) - if do_classifier_free_guidance else - decoder_latents) + latent_model_input = ( + paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents + ) noise_pred = self.decoder( sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, 
class_labels=additive_clip_time_embeddings, - attention_mask=decoder_text_mask, ).sample + attention_mask=decoder_text_mask, + ).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) @@ -431,20 +425,19 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ latent_model_input.shape[1], - noise_pred_uncond.shape[1] - - latent_model_input.shape[1], + noise_pred_uncond.shape[1] - latent_model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ latent_model_input.shape[1], noise_pred_text.shape[1] - latent_model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + decoder_guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) if i + 1 == decoder_timesteps_tensor.shape[0]: prev_timestep = None @@ -457,7 +450,8 @@ def __call__( t, decoder_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample decoder_latents = decoder_latents.clip(-1, 1) @@ -479,7 +473,8 @@ def __call__( image_small.dtype, generator, super_res_latents, - self.super_res_scheduler, ) + self.super_res_scheduler, + ) interpolate_antialias = {} if "antialias" in inspect.signature(F.interpolate).parameters: @@ -490,7 +485,8 @@ def __call__( size=[height, width], mode="bicubic", align_corners=False, - **interpolate_antialias, ) + **interpolate_antialias, + ) for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): # no classifier free guidance @@ -501,15 +497,14 @@ def __call__( unet = self.super_res_first latent_model_input = paddle.concat( - [ - super_res_latents, - image_upscaled.cast(super_res_latents.dtype) - ], - axis=1, ) + [super_res_latents, image_upscaled.cast(super_res_latents.dtype)], + axis=1, + ) noise_pred = unet( sample=latent_model_input, - timestep=t, ).sample + timestep=t, + ).sample if i + 1 == super_res_timesteps_tensor.shape[0]: prev_timestep = None @@ -522,7 +517,8 @@ def __call__( t, super_res_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample image = super_res_latents # done super res @@ -537,6 +533,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py index ada35969b9c65..f303633b838ee 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -18,9 +18,12 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...models import UNet2DConditionModel, UNet2DModel from ...pipelines import DiffusionPipeline, ImagePipelineOutput @@ -78,17 +81,18 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline): super_res_scheduler: UnCLIPScheduler def __init__( 
- self, - decoder: UNet2DConditionModel, - text_encoder: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - text_proj: UnCLIPTextProjModel, - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - super_res_first: UNet2DModel, - super_res_last: UNet2DModel, - decoder_scheduler: UnCLIPScheduler, - super_res_scheduler: UnCLIPScheduler, ): + self, + decoder: UNet2DConditionModel, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_proj: UnCLIPTextProjModel, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + super_res_first: UNet2DModel, + super_res_last: UNet2DModel, + decoder_scheduler: UnCLIPScheduler, + super_res_scheduler: UnCLIPScheduler, + ): super().__init__() self.register_modules( @@ -101,7 +105,8 @@ def __init__( super_res_first=super_res_first, super_res_last=super_res_last, decoder_scheduler=decoder_scheduler, - super_res_scheduler=super_res_scheduler, ) + super_res_scheduler=super_res_scheduler, + ) # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, generator, latents, scheduler): @@ -109,15 +114,12 @@ def prepare_latents(self, shape, dtype, generator, latents, scheduler): latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents * scheduler.init_noise_sigma return latents - def _encode_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -126,7 +128,8 @@ def _encode_prompt(self, prompt, num_images_per_prompt, padding="max_length", max_length=self.tokenizer.model_max_length, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask text_encoder_output = self.text_encoder(text_input_ids) @@ -137,21 +140,19 @@ def _encode_prompt(self, prompt, num_images_per_prompt, # duplicate text embeddings for each generation per prompt seq_len = prompt_embeds.shape[1] prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt]) - prompt_embeds = prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) # duplicate text_encoder_hidden_states for each generation per prompt seq_len = text_encoder_hidden_states.shape[1] - text_encoder_hidden_states = text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) + text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) text_encoder_hidden_states = text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate text_mask for each generation per prompt seq_len = text_mask.shape[1] text_mask = text_mask.tile([1, num_images_per_prompt]) - text_mask = text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) # 
text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0) @@ -167,91 +168,81 @@ def _encode_prompt(self, prompt, num_images_per_prompt, max_length=max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_text_encoder_output = self.text_encoder( - uncond_input.input_ids) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids) - negative_prompt_embeds = ( - negative_prompt_embeds_text_encoder_output.text_embeds) - uncond_text_encoder_hidden_states = ( - negative_prompt_embeds_text_encoder_output.last_hidden_state) + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) - uncond_text_encoder_hidden_states = ( - uncond_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1])) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape( + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate uncond_text_mask for each generation per prompt seq_len = uncond_text_mask.shape[1] uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt]) - uncond_text_mask = uncond_text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) # done duplicates # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = paddle.concat([ - uncond_text_encoder_hidden_states, text_encoder_hidden_states - ]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) text_mask = paddle.concat([uncond_text_mask, text_mask]) return prompt_embeds, text_encoder_hidden_states, text_mask def _encode_image( - self, - image, - num_images_per_prompt, - image_embeddings: Optional[paddle.Tensor]=None, ): + self, + image, + num_images_per_prompt, + image_embeddings: Optional[paddle.Tensor] = None, + ): dtype = self.image_encoder.dtype if image_embeddings is None: if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) image_embeddings = self.image_encoder(image).image_embeds batch_size, seq_len = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt]) - image_embeddings = image_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len]) + image_embeddings = image_embeddings.reshape([batch_size * num_images_per_prompt, seq_len]) # image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, axis=0) return image_embeddings @paddle.no_grad() def __call__( - self, - image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], - paddle.Tensor]]=None, - num_images_per_prompt: int=1, - decoder_num_inference_steps: int=25, - super_res_num_inference_steps: int=7, - generator: Optional[paddle.Generator]=None, - decoder_latents: Optional[paddle.Tensor]=None, - super_res_latents: Optional[paddle.Tensor]=None, - image_embeddings: Optional[paddle.Tensor]=None, - decoder_guidance_scale: float=8.0, - output_type: Optional[str]="pil", - return_dict: bool=True, ): + self, + image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor]] = None, + num_images_per_prompt: int = 1, + decoder_num_inference_steps: int = 25, + super_res_num_inference_steps: int = 7, + generator: Optional[paddle.Generator] = None, + decoder_latents: Optional[paddle.Tensor] = None, + super_res_latents: Optional[paddle.Tensor] = None, + image_embeddings: Optional[paddle.Tensor] = None, + decoder_guidance_scale: float = 8.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): """ Function invoked when calling the pipeline for generation. 
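Throughout these unCLIP hunks, per-prompt duplication of embeddings is written as `tile` followed by `reshape`; the commented-out lines note the equivalent `repeat_interleave` call. A minimal sketch showing that the two forms agree for this layout, with illustrative sizes (the variable names below are not taken from the diff):

import paddle

num_images_per_prompt = 2
embeds = paddle.arange(6, dtype="float32").reshape([2, 3])  # (batch_size, seq_len)
batch_size, seq_len = embeds.shape

# Pattern used in the diff: tile along the last axis, then fold the copies into the batch axis.
tiled = embeds.tile([1, num_images_per_prompt]).reshape([batch_size * num_images_per_prompt, seq_len])

# Equivalent form referenced by the commented-out repeat_interleave lines.
interleaved = paddle.repeat_interleave(embeds, num_images_per_prompt, axis=0)
assert bool((tiled == interleaved).all())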
@@ -307,23 +298,25 @@ def __call__( do_classifier_free_guidance = decoder_guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance) + prompt, num_images_per_prompt, do_classifier_free_guidance + ) - image_embeddings = self._encode_image(image, num_images_per_prompt, - image_embeddings) + image_embeddings = self._encode_image(image, num_images_per_prompt, image_embeddings) # decoder text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeddings, prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) decoder_text_mask = F.pad( text_mask.unsqueeze(0), (self.text_proj.clip_extra_context_tokens, 0), value=1, - data_format="NCL", ).squeeze(0) + data_format="NCL", + ).squeeze(0) self.decoder_scheduler.set_timesteps(decoder_num_inference_steps) decoder_timesteps_tensor = self.decoder_scheduler.timesteps @@ -338,20 +331,22 @@ def __call__( text_encoder_hidden_states.dtype, generator, decoder_latents, - self.decoder_scheduler, ) + self.decoder_scheduler, + ) for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([decoder_latents] * 2) - if do_classifier_free_guidance else - decoder_latents) + latent_model_input = ( + paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents + ) noise_pred = self.decoder( sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, class_labels=additive_clip_time_embeddings, - attention_mask=decoder_text_mask, ).sample + attention_mask=decoder_text_mask, + ).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) @@ -359,20 +354,19 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ latent_model_input.shape[1], - noise_pred_uncond.shape[1] - - latent_model_input.shape[1], + noise_pred_uncond.shape[1] - latent_model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ latent_model_input.shape[1], noise_pred_text.shape[1] - latent_model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + decoder_guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) if i + 1 == decoder_timesteps_tensor.shape[0]: prev_timestep = None @@ -385,7 +379,8 @@ def __call__( t, decoder_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample decoder_latents = decoder_latents.clip(-1, 1) @@ -408,7 +403,8 @@ def __call__( image_small.dtype, generator, super_res_latents, - self.super_res_scheduler, ) + self.super_res_scheduler, + ) interpolate_antialias = {} if "antialias" in inspect.signature(F.interpolate).parameters: @@ -419,7 +415,8 @@ def __call__( size=[height, width], mode="bicubic", align_corners=False, - **interpolate_antialias, ) + **interpolate_antialias, + ) for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): # no classifier free guidance @@ -430,15 +427,14 @@ def 
__call__( unet = self.super_res_first latent_model_input = paddle.concat( - [ - super_res_latents, - image_upscaled.cast(super_res_latents.dtype) - ], - axis=1, ) + [super_res_latents, image_upscaled.cast(super_res_latents.dtype)], + axis=1, + ) noise_pred = unet( sample=latent_model_input, - timestep=t, ).sample + timestep=t, + ).sample if i + 1 == super_res_timesteps_tensor.shape[0]: prev_timestep = None @@ -451,7 +447,8 @@ def __call__( t, super_res_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample image = super_res_latents @@ -467,6 +464,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py index 3ce07c27f08b6..69b442fa526ee 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py @@ -29,53 +29,52 @@ class UnCLIPTextProjModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - *, - clip_extra_context_tokens: int=4, - clip_embeddings_dim: int=768, - time_embed_dim: int, - cross_attention_dim, ): + self, + *, + clip_extra_context_tokens: int = 4, + clip_embeddings_dim: int = 768, + time_embed_dim: int, + cross_attention_dim, + ): super().__init__() self.learned_classifier_free_guidance_embeddings = self.create_parameter( - (clip_embeddings_dim, ), + (clip_embeddings_dim,), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) # parameters for additional clip time embeddings self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim) - self.clip_image_embeddings_project_to_time_embeddings = nn.Linear( - clip_embeddings_dim, time_embed_dim) + self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim) # parameters for encoder hidden states self.clip_extra_context_tokens = clip_extra_context_tokens self.clip_extra_context_tokens_proj = nn.Linear( - clip_embeddings_dim, - self.clip_extra_context_tokens * cross_attention_dim) - self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, - cross_attention_dim) + clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim + ) + self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, cross_attention_dim) self.text_encoder_hidden_states_norm = nn.LayerNorm(cross_attention_dim) def forward( - self, - *, - image_embeddings, - prompt_embeds, - text_encoder_hidden_states, - do_classifier_free_guidance, ): + self, + *, + image_embeddings, + prompt_embeds, + text_encoder_hidden_states, + do_classifier_free_guidance, + ): image_embeddings = image_embeddings.cast(self.dtype) if do_classifier_free_guidance: # Add the classifier free guidance embeddings to the image embeddings image_embeddings_batch_size = image_embeddings.shape[0] - classifier_free_guidance_embeddings = ( - self.learned_classifier_free_guidance_embeddings.unsqueeze(0)) - classifier_free_guidance_embeddings = ( - classifier_free_guidance_embeddings.expand( - [image_embeddings_batch_size, -1])) - image_embeddings = paddle.concat( - [classifier_free_guidance_embeddings, image_embeddings], axis=0) + classifier_free_guidance_embeddings = self.learned_classifier_free_guidance_embeddings.unsqueeze(0) + classifier_free_guidance_embeddings = 
classifier_free_guidance_embeddings.expand( + [image_embeddings_batch_size, -1] + ) + image_embeddings = paddle.concat([classifier_free_guidance_embeddings, image_embeddings], axis=0) # The image embeddings batch size and the text embeddings batch size are equal assert image_embeddings.shape[0] == prompt_embeds.shape[0] @@ -85,26 +84,17 @@ def forward( # "Specifically, we modify the architecture described in Nichol et al. (2021) by projecting and # adding CLIP embeddings to the existing timestep embedding, ... time_projected_prompt_embeds = self.embedding_proj(prompt_embeds) - time_projected_image_embeddings = ( - self.clip_image_embeddings_project_to_time_embeddings( - image_embeddings)) - additive_clip_time_embeddings = ( - time_projected_image_embeddings + time_projected_prompt_embeds) + time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) + additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds # ... and by projecting CLIP embeddings into four # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" - clip_extra_context_tokens = self.clip_extra_context_tokens_proj( - image_embeddings) - clip_extra_context_tokens = clip_extra_context_tokens.reshape( - [batch_size, -1, self.clip_extra_context_tokens]) - clip_extra_context_tokens = clip_extra_context_tokens.transpose( - [0, 2, 1]) - - text_encoder_hidden_states = self.encoder_hidden_states_proj( - text_encoder_hidden_states) - text_encoder_hidden_states = self.text_encoder_hidden_states_norm( - text_encoder_hidden_states) - text_encoder_hidden_states = paddle.concat( - [clip_extra_context_tokens, text_encoder_hidden_states], axis=1) + clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) + clip_extra_context_tokens = clip_extra_context_tokens.reshape([batch_size, -1, self.clip_extra_context_tokens]) + clip_extra_context_tokens = clip_extra_context_tokens.transpose([0, 2, 1]) + + text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) + text_encoder_hidden_states = self.text_encoder_hidden_states_norm(text_encoder_hidden_states) + text_encoder_hidden_states = paddle.concat([clip_extra_context_tokens, text_encoder_hidden_states], axis=1) return text_encoder_hidden_states, additive_clip_time_embeddings diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py index 769e211a22e88..d0e447e0ef36e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py @@ -18,9 +18,13 @@ import numpy as np import PIL -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_einops_available, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_einops_available, + is_paddle_available, + is_paddlenlp_available, +) @dataclass @@ -40,12 +44,12 @@ class ImageTextPipelineOutput(BaseOutput): try: - if not (is_paddlenlp_available() and is_paddle_available() and - is_einops_available()): + if not (is_paddlenlp_available() and is_paddle_available() and is_einops_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import \ - UniDiffuserPipeline + from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import ( + UniDiffuserPipeline, + ) 
from ...utils.dummy_paddle_and_paddlenlp_objects import CaptionDecoder else: from .caption_decoder import CaptionDecoder diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py index 5fd8b8659eb9a..81f5e5a0b5212 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py @@ -27,19 +27,20 @@ class CaptionDecoder(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - prefix_length: int=77, - hidden_dim: int=64, - vocab_size: int=50258, - hidden_size: int=768, - num_hidden_layers: int=12, - intermediate_size: int=3072, - hidden_act: int="gelu", - hidden_dropout_prob: int=0.1, - attention_probs_dropout_prob: int=0.1, - max_position_embeddings: int=1024, - initializer_range: int=0.02, - eos_token_id: int=50257, ): + self, + prefix_length: int = 77, + hidden_dim: int = 64, + vocab_size: int = 50258, + hidden_size: int = 768, + num_hidden_layers: int = 12, + intermediate_size: int = 3072, + hidden_act: int = "gelu", + hidden_dropout_prob: int = 0.1, + attention_probs_dropout_prob: int = 0.1, + max_position_embeddings: int = 1024, + initializer_range: int = 0.02, + eos_token_id: int = 50257, + ): super(CaptionDecoder, self).__init__() self.prefix_length = prefix_length config = GPTConfig( @@ -52,25 +53,24 @@ def __init__( attention_probs_dropout_prob=attention_probs_dropout_prob, max_position_embeddings=max_position_embeddings, initializer_range=initializer_range, - eos_token_id=eos_token_id, ) + eos_token_id=eos_token_id, + ) self.gpt = GPTLMHeadModel(config) self.hidden_dim = hidden_dim - self.encode_prefix = (nn.Linear(hidden_size, hidden_dim) - if hidden_dim is not None else nn.Identity()) - self.decode_prefix = (nn.Linear(hidden_dim, hidden_size) - if hidden_dim is not None else nn.Identity()) + self.encode_prefix = nn.Linear(hidden_size, hidden_dim) if hidden_dim is not None else nn.Identity() + self.decode_prefix = nn.Linear(hidden_dim, hidden_size) if hidden_dim is not None else nn.Identity() def get_dummy_token(self, batch_size: int) -> paddle.Tensor: - return paddle.zeros( - [batch_size, self.prefix_length], dtype=paddle.int64) + return paddle.zeros([batch_size, self.prefix_length], dtype=paddle.int64) def forward( - self, - tokens: paddle.Tensor, - prefix: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - labels: Optional[paddle.Tensor]=None, ): + self, + tokens: paddle.Tensor, + prefix: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + ): embedding_text = self.gpt.gpt.embeddings.word_embeddings(tokens) hidden = self.encode_prefix(prefix) prefix = self.decode_prefix(hidden) @@ -79,9 +79,7 @@ def forward( if labels is not None: dummy_token = self.get_dummy_token(tokens.shape[0]) labels = paddle.concat((dummy_token, tokens), axis=1) - out = self.gpt(inputs_embeds=embedding_cat, - labels=labels, - attention_mask=attention_mask) + out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask) if self.hidden_dim: return out, hidden @@ -98,24 +96,21 @@ def generate_captions(self, tokenizer, features, use_beam_search=True): for feature in features: feature = self.decode_prefix(feature) # back to the clip feature if use_beam_search: - generated_captions.append( - self.generate_beam( - tokenizer=tokenizer, embedding=feature)[0]) + generated_captions.append(self.generate_beam(tokenizer=tokenizer, 
embedding=feature)[0]) else: - generated_captions.append( - self.generate2( - tokenizer=tokenizer, embedding=feature)) + generated_captions.append(self.generate2(tokenizer=tokenizer, embedding=feature)) return generated_captions @paddle.no_grad() def generate_beam( - self, - tokenizer, - prompt=None, - embedding=None, - beam_size: int=5, - entry_length: int=67, # maximum number of words - temperature: float=1.0, ): + self, + tokenizer, + prompt=None, + embedding=None, + beam_size: int = 5, + entry_length: int = 67, # maximum number of words + temperature: float = 1.0, + ): stop_token_index = self.gpt.config.eos_token_id tokens = None scores = None @@ -132,14 +127,12 @@ def generate_beam( for i in range(entry_length): logits = self.gpt(inputs_embeds=generated) - logits = logits[:, -1, :] / (temperature - if temperature > 0 else 1.0) + logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0) logits = F.softmax(logits, axis=-1).log() if scores is None: scores, next_tokens = logits.topk(beam_size, -1) generated = generated.expand([beam_size, *generated.shape[1:]]) - next_tokens, scores = next_tokens.transpose( - [1, 0]), scores.squeeze(0) + next_tokens, scores = next_tokens.transpose([1, 0]), scores.squeeze(0) if tokens is None: tokens = next_tokens else: @@ -151,8 +144,7 @@ def generate_beam( scores_sum = scores[:, None] + logits seq_lengths[~is_stopped] += 1 scores_sum_average = scores_sum / seq_lengths[:, None] - scores_sum_average, next_tokens = scores_sum_average.reshape( - [-1]).topk(beam_size, -1) + scores_sum_average, next_tokens = scores_sum_average.reshape([-1]).topk(beam_size, -1) next_tokens_source = next_tokens // scores_sum.shape[1] seq_lengths = seq_lengths[next_tokens_source] next_tokens = next_tokens % scores_sum.shape[1] @@ -165,19 +157,18 @@ def generate_beam( is_stopped = is_stopped[next_tokens_source] is_stopped = paddle.cast(is_stopped, "bool") - next_token_embed = self.gpt.get_input_embeddings()( - next_tokens.squeeze()).reshape([generated.shape[0], 1, -1]) + next_token_embed = self.gpt.get_input_embeddings()(next_tokens.squeeze()).reshape( + [generated.shape[0], 1, -1] + ) generated = paddle.concat((generated, next_token_embed), axis=1) - is_stopped = paddle.bitwise_or( - is_stopped, next_tokens.equal(stop_token_index).squeeze()) + is_stopped = paddle.bitwise_or(is_stopped, next_tokens.equal(stop_token_index).squeeze()) if is_stopped.all(): break scores = scores / seq_lengths output_list = tokens.cpu().numpy() output_texts = [ - tokenizer.decode( - output[:int(length)], skip_special_tokens=True) + tokenizer.decode(output[: int(length)], skip_special_tokens=True) for output, length in zip(output_list, seq_lengths) ] order = scores.argsort(descending=True) @@ -186,15 +177,16 @@ def generate_beam( @paddle.no_grad() def generate2( - self, - tokenizer, - tokens=None, - prompt=None, - embedding=None, - entry_count: int=1, - entry_length: int=67, # maximum number of words - top_p: float=0.8, - temperature: float=1.0, ): + self, + tokenizer, + tokens=None, + prompt=None, + embedding=None, + entry_count: int = 1, + entry_length: int = 67, # maximum number of words + top_p: float = 0.8, + temperature: float = 1.0, + ): generated_list = [] stop_token_index = self.gpt.config.eos_token_id filter_value = -float("Inf") @@ -210,16 +202,12 @@ def generate2( for entry_idx in range(entry_length): logits = self.gpt(inputs_embeds=generated) - logits = logits[:, -1, :] / (temperature - if temperature > 0 else 1.0) + logits = logits[:, -1, :] / (temperature if temperature > 0 
else 1.0) sorted_logits = paddle.sort(logits, descending=True) sorted_indices = paddle.argsort(logits, descending=True) - cumulative_probs = paddle.cumsum( - F.softmax( - sorted_logits, axis=-1), axis=-1) + cumulative_probs = paddle.cumsum(F.softmax(sorted_logits, axis=-1), axis=-1) sorted_indices_to_remove = cumulative_probs > top_p - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() sorted_indices_to_remove[..., 0] = 0 indices_to_remove = sorted_indices[sorted_indices_to_remove] @@ -235,8 +223,7 @@ def generate2( break output_list = list(tokens.squeeze().cpu().numpy()) - output_text = tokenizer.decode( - output_list, skip_special_tokens=True) + output_text = tokenizer.decode(output_list, skip_special_tokens=True) generated_list.append(output_text) return generated_list[0] diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 17bab677a8e47..c025b3e06973e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -19,9 +19,13 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection, GPTTokenizer) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + GPTTokenizer, +) from PIL import Image from ...models import AutoencoderKL, UViTModel @@ -37,15 +41,15 @@ def center_crop(width, height, img): resample = {"box": Image.BOX, "lanczos": Image.LANCZOS}["lanczos"] crop = np.min(img.shape[:2]) - img = img[(img.shape[0] - crop) // 2:(img.shape[0] + crop) // 2, (img.shape[ - 1] - crop) // 2:(img.shape[1] + crop) // 2, ] # center crop + img = img[ + (img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, + (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2, + ] # center crop try: img = Image.fromarray(img, "RGB") except: img = Image.fromarray(img) - img = img.resize( - (width, height), - resample) # resize the center crop from [crop, crop] to [width, height] + img = img.resize((width, height), resample) # resize the center crop from [crop, crop] to [width, height] return np.array(img).astype(np.uint8) @@ -62,16 +66,17 @@ class UniDiffuserPipeline(DiffusionPipeline): scheduler: DPMSolverUniDiffuserScheduler def __init__( - self, - image_encoder: CLIPVisionModelWithProjection, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UViTModel, - vae: AutoencoderKL, - caption_decoder: CaptionDecoder, - caption_tokenizer: GPTTokenizer, - scheduler: DPMSolverUniDiffuserScheduler, ): + self, + image_encoder: CLIPVisionModelWithProjection, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UViTModel, + vae: AutoencoderKL, + caption_decoder: CaptionDecoder, + caption_tokenizer: GPTTokenizer, + scheduler: DPMSolverUniDiffuserScheduler, + ): super().__init__() self.register_modules( image_encoder=image_encoder, @@ -82,51 +87,48 @@ def __init__( vae=vae, caption_decoder=caption_decoder, caption_tokenizer=caption_tokenizer, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** 
(len(self.vae.config.block_out_channels) - 1) self.num_channels_latents = vae.latent_channels # 4 self.image_encoder_clip_img_dim = image_encoder.config.projection_dim # 512 self.text_encoder_seq_len = tokenizer.model_max_length # 77 - self.text_encoder_text_dim = ( - text_encoder.config.hidden_size // - text_encoder.config.num_attention_heads) # 64 + self.text_encoder_text_dim = text_encoder.config.hidden_size // text_encoder.config.num_attention_heads # 64 # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -139,10 +141,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." 
+ ) - def _infer_batch_size(self, mode, image, prompt, prompt_embeds, - num_samples): + def _infer_batch_size(self, mode, image, prompt, prompt_embeds, num_samples): if mode in ["t2i", "t2i2t"]: if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -169,20 +171,16 @@ def _split(self, x, height, width): latent_width = width // self.vae_scale_factor img_vae_dim = self.num_channels_latents * latent_height * latent_width - img_vae, img_clip = x.split( - [img_vae_dim, self.image_encoder_clip_img_dim], axis=1) + img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_clip_img_dim], axis=1) img_vae = einops.rearrange( img_vae, "B (C H W) -> B C H W", C=self.num_channels_latents, H=latent_height, - W=latent_width, ) - img_clip = einops.rearrange( - img_clip, - "B (L D) -> B L D", - L=1, - D=self.image_encoder_clip_img_dim) + W=latent_width, + ) + img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim) return img_vae, img_clip def _combine(self, img_vae, img_clip): @@ -205,24 +203,21 @@ def _split_joint(self, x, height, width): img_vae_dim = self.num_channels_latents * latent_height * latent_width text_dim = self.text_encoder_seq_len * self.text_encoder_text_dim - img_vae, img_clip, text = x.split( - [img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1) + img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1) img_vae = einops.rearrange( img_vae, "B (C H W) -> B C H W", C=self.num_channels_latents, H=latent_height, - W=latent_width, ) - img_clip = einops.rearrange( - img_clip, - "B (L D) -> B L D", - L=1, - D=self.image_encoder_clip_img_dim) + W=latent_width, + ) + img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim) text = einops.rearrange( text, "B (L D) -> B L D", L=self.text_encoder_seq_len, - D=self.text_encoder_text_dim, ) + D=self.text_encoder_text_dim, + ) return img_vae, img_clip, text def _combine_joint(self, img_vae, img_clip, text): @@ -238,34 +233,29 @@ def _combine_joint(self, img_vae, img_clip, text): # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def encode_text_latents( - self, - prompt, - num_images_per_prompt, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): if prompt_embeds is None: text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) prompt_embeds = self.text_encoder(text_inputs.input_ids)[0] return prompt_embeds # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents - def encode_image_vae_latents(self, - image, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def encode_image_vae_latents(self, image, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, paddle.Tensor): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `paddle.Tensor`, but is {type(image)}") image = image.cast(dtype) batch_size = batch_size * num_images_per_prompt @@ 
-278,17 +268,14 @@ def encode_image_vae_latents(self, # vae encode if isinstance(generator, list): image_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - * self.vae.scaling_factor for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) * self.vae.scaling_factor + for i in range(batch_size) ] image_latents = paddle.concat(image_latents, axis=0) else: - image_latents = ( - self.vae.encode(image).latent_dist.sample(generator) * - self.vae.scaling_factor) + image_latents = self.vae.encode(image).latent_dist.sample(generator) * self.vae.scaling_factor - if (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] != 0): + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." ) @@ -299,22 +286,20 @@ def encode_image_vae_latents(self, # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents def encode_image_clip_latents( - self, - image, - batch_size, - num_images_per_prompt, - dtype, ): + self, + image, + batch_size, + num_images_per_prompt, + dtype, + ): batch_size = batch_size * num_images_per_prompt # clip encode - inputs = self.image_feature_extractor( - images=Image.fromarray(image), return_tensors="pd").pixel_values + inputs = self.image_feature_extractor(images=Image.fromarray(image), return_tensors="pd").pixel_values # TODO junnyu, support float16 we need cast dtype - image_latents = self.image_encoder( - inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1) + image_latents = self.image_encoder(inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1) - if (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] != 0): + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." ) @@ -333,13 +318,7 @@ def decode_image_latents(self, latents): return image # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_text_latents(self, - batch_size, - seq_len, - hidden_size, - dtype, - generator, - latents=None): + def prepare_text_latents(self, batch_size, seq_len, hidden_size, dtype, generator, latents=None): # Prepare text latents for the CLIP embedded prompt. shape = [batch_size, seq_len, hidden_size] if isinstance(generator, list) and len(generator) != batch_size: @@ -357,14 +336,15 @@ def prepare_text_latents(self, # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_image_vae_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): # Prepare latents for the VAE embedded image. 
shape = [ batch_size, @@ -386,12 +366,7 @@ def prepare_image_vae_latents( return latents # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_image_clip_latents(self, - batch_size, - clip_img_dim, - dtype, - generator, - latents=None): + def prepare_image_clip_latents(self, batch_size, clip_img_dim, dtype, generator, latents=None): # Prepare latents for the CLIP embedded image. shape = [batch_size, 1, clip_img_dim] if isinstance(generator, list) and len(generator) != batch_size: @@ -408,66 +383,61 @@ def prepare_image_clip_latents(self, return latents def get_noise_pred( - self, - mode, - latents, - t, - img_vae, - img_clip, - prompt_embeds, - N, - guidance_scale, - height, - width, - data_type=1, - generator=None, ): + self, + mode, + latents, + t, + img_vae, + img_clip, + prompt_embeds, + N, + guidance_scale, + height, + width, + data_type=1, + generator=None, + ): dtype = self.unet.dtype if mode == "joint": - img_vae_latents, img_clip_latents, text_latents = self._split_joint( - latents, height, width) + img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width) img_vae_out, img_clip_out, text_out = self.unet( img=img_vae_latents, clip_img=img_clip_latents, text=text_latents, t_img=t, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) x_out = self._combine_joint(img_vae_out, img_clip_out, text_out) if guidance_scale == 0.0: return x_out - img_vae_T = randn_tensor( - img_vae.shape, generator=generator, dtype=dtype) - img_clip_T = randn_tensor( - img_clip.shape, generator=generator, dtype=dtype) + img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype) _, _, text_out_uncond = self.unet( img=img_vae_T, clip_img=img_clip_T, text=text_latents, t_img=paddle.ones_like(t) * N, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) - text_T = randn_tensor( - prompt_embeds.shape, generator=generator, dtype=dtype) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) + text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype) img_vae_out_uncond, img_clip_out_uncond, _ = self.unet( img=img_vae_latents, clip_img=img_clip_latents, text=text_T, t_img=t, t_text=paddle.ones_like(t) * N, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) - x_out_uncond = self._combine_joint( - img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) + x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) return x_out + guidance_scale * (x_out - x_out_uncond) elif mode == "t2i": - img_vae_latents, img_clip_latents = self._split(latents, height, - width) + img_vae_latents, img_clip_latents = self._split(latents, height, width) t_text = paddle.zeros([t.shape[0]], dtype=paddle.int32) img_vae_out, img_clip_out, text_out = self.unet( img=img_vae_latents, @@ -475,25 +445,23 @@ def get_noise_pred( text=prompt_embeds, t_img=t, t_text=t_text, - data_type=paddle.zeros_like( - t_text, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, + ) img_out = self._combine(img_vae_out, img_clip_out) if guidance_scale == 0.0: return img_out - text_T = randn_tensor( - prompt_embeds.shape, 
generator=generator, dtype=dtype) + text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype) img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( img=img_vae_latents, clip_img=img_clip_latents, text=text_T, t_img=t, t_text=paddle.ones_like(t) * N, - data_type=paddle.zeros_like( - t_text, dtype=paddle.int32) + data_type, ) - img_out_uncond = self._combine(img_vae_out_uncond, - img_clip_out_uncond) + data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, + ) + img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond) return img_out + guidance_scale * (img_out - img_out_uncond) @@ -505,23 +473,21 @@ def get_noise_pred( text=latents, t_img=t_img, t_text=t, - data_type=paddle.zeros_like( - t_img, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t_img, dtype=paddle.int32) + data_type, + ) if guidance_scale == 0.0: return text_out - img_vae_T = randn_tensor( - img_vae.shape, generator=generator, dtype=dtype) - img_clip_T = randn_tensor( - img_clip.shape, generator=generator, dtype=dtype) + img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype) img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( img=img_vae_T, clip_img=img_clip_T, text=latents, t_img=paddle.ones_like(t) * N, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) return text_out + guidance_scale * (text_out - text_out_uncond) elif mode == "t": @@ -531,13 +497,12 @@ def get_noise_pred( text=latents, t_img=paddle.ones_like(t) * N, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) return text_out elif mode == "i": - img_vae_latents, img_clip_latents = self._split(latents, height, - width) + img_vae_latents, img_clip_latents = self._split(latents, height, width) t_text = paddle.ones_like(t) * N img_vae_out, img_clip_out, text_out = self.unet( img=img_vae_latents, @@ -545,8 +510,8 @@ def get_noise_pred( text=prompt_embeds, t_img=t, t_text=t_text, - data_type=paddle.zeros_like( - t_text, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, + ) img_out = self._combine(img_vae_out, img_clip_out) return img_out @@ -557,36 +522,34 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def _denoising_sample_fn( - self, - mode, - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, ): + self, + mode, + image_vae_latents, + image_clip_latents, + prompt_embeds, + num_inference_steps, + extra_step_kwargs, + guidance_scale, + 
height, + width, + callback, + callback_steps, + ): # Prepare latent variables if mode == "joint": - latents = self._combine_joint(image_vae_latents, image_clip_latents, - prompt_embeds) + latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds) elif mode in ["t2i", "i"]: latents = self._combine(image_vae_latents, image_clip_latents) elif mode in ["i2t", "t"]: @@ -599,8 +562,7 @@ def _denoising_sample_fn( timesteps = self.scheduler.timesteps N = self.scheduler.config.num_train_timesteps - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): noise_pred = self.get_noise_pred( @@ -613,27 +575,23 @@ def _denoising_sample_fn( N, guidance_scale, height, - width, ) + width, + ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if mode == "joint": - image_vae_latents, image_clip_latents, text_latents = self._split_joint( - latents, height, width) + image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width) return image_vae_latents, image_clip_latents, text_latents elif mode in ["t2i", "i"]: - image_vae_latents, image_clip_latents = self._split(latents, height, - width) + image_vae_latents, image_clip_latents = self._split(latents, height, width) return image_vae_latents, image_clip_latents elif mode in ["i2t", "t"]: text_latents = latents @@ -641,32 +599,32 @@ def _denoising_sample_fn( @paddle.no_grad() def __call__( - self, - mode: str="t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t - image: Optional[Union[paddle.Tensor, PIL.Image.Image]]=None, - prompt: Optional[Union[str, List[str]]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - num_prompts_per_image: Optional[int]=1, - num_samples: int=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_latents: Optional[paddle.Tensor]=None, - vae_latents: Optional[paddle.Tensor]=None, - clip_latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - use_beam_search: Optional[bool]=True, - **kwargs, ): + self, + mode: str = "t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t + image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 
1, + num_prompts_per_image: Optional[int] = 1, + num_samples: int = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_latents: Optional[paddle.Tensor] = None, + vae_latents: Optional[paddle.Tensor] = None, + clip_latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + use_beam_search: Optional[bool] = True, + **kwargs, + ): # 0. Default height and width to unet height = height or self.unet.config.img_size * self.vae_scale_factor width = width or self.unet.config.img_size * self.vae_scale_factor @@ -679,8 +637,7 @@ def __call__( self.check_inputs([prompt], height, width, callback_steps) # 2. Define call parameters - batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds, - num_samples) + batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds, num_samples) # 3. Encode input prompt if available; otherwise prepare text latents if mode in ["t2i", "t2i2t"]: @@ -691,7 +648,8 @@ def __call__( num_images_per_prompt, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # Encode contexts to lower text dim, 768 -> 64 prompt_embeds = self.unet.encode_prefix(prompt_embeds) else: @@ -700,10 +658,10 @@ def __call__( batch_size, self.text_encoder_seq_len, self.text_encoder_text_dim, - paddle. - float32, # Placeholder, need to determine correct thing to do for dtype + paddle.float32, # Placeholder, need to determine correct thing to do for dtype generator, - prompt_latents, ) + prompt_latents, + ) # 4. Encode input image if available; otherwise prepare image latents if mode in ["i2t", "i2t2i"]: @@ -716,7 +674,8 @@ def __call__( image_crop, batch_size, num_prompts_per_image, # not num_images_per_prompt - prompt_embeds.dtype, ) + prompt_embeds.dtype, + ) # Encode image using VAE image_vae = (image_crop / 127.5 - 1.0).astype(np.float32) image_vae = einops.rearrange(image_vae, "h w c -> 1 c h w") @@ -725,7 +684,8 @@ def __call__( batch_size, num_prompts_per_image, # not num_images_per_prompt prompt_embeds.dtype, - generator, ) + generator, + ) else: # 4.2. Prepare image latent variables, if necessary @@ -735,7 +695,8 @@ def __call__( self.image_encoder_clip_img_dim, prompt_embeds.dtype, generator, - clip_latents, ) + clip_latents, + ) # Prepare image VAE latents image_vae_latents = self.prepare_image_vae_latents( batch_size * num_images_per_prompt, @@ -744,7 +705,8 @@ def __call__( width, prompt_embeds.dtype, generator, - vae_latents, ) + vae_latents, + ) # 5. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -762,7 +724,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) elif mode in ["i2t2i"]: # 'i2t2i' should do 'i2t' first outs = self._denoising_sample_fn( @@ -776,7 +739,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) elif mode in ["t2i2t"]: # 't2i2t' should do 't2i' first outs = self._denoising_sample_fn( @@ -790,7 +754,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) else: raise ValueError @@ -800,9 +765,8 @@ def __call__( image_vae_latents, image_clip_latents, text_latents = outs gen_image = self.decode_image_latents(image_vae_latents) gen_text = self.caption_decoder.generate_captions( - self.caption_tokenizer, - text_latents, - use_beam_search=use_beam_search) + self.caption_tokenizer, text_latents, use_beam_search=use_beam_search + ) elif mode in ["t2i", "i", "t2i2t"]: image_vae_latents, image_clip_latents = outs @@ -814,10 +778,10 @@ def __call__( batch_size, self.text_encoder_seq_len, self.text_encoder_text_dim, - paddle. - float32, # Placeholder, need to determine correct thing to do for dtype + paddle.float32, # Placeholder, need to determine correct thing to do for dtype generator, - prompt_latents, ) + prompt_latents, + ) text_latents = self._denoising_sample_fn( "i2t", image_vae_latents, @@ -829,11 +793,13 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) gen_text = self.caption_decoder.generate_captions( self.caption_tokenizer, text_latents, - use_beam_search=use_beam_search, ) + use_beam_search=use_beam_search, + ) elif mode in ["i2t", "t", "i2t2i"]: text_latents = outs @@ -841,7 +807,8 @@ def __call__( gen_text = self.caption_decoder.generate_captions( self.caption_tokenizer, text_latents, - use_beam_search=use_beam_search, ) + use_beam_search=use_beam_search, + ) else: # 'i2t2i' should do 't2i' later # Prepare image CLIP latents @@ -850,7 +817,8 @@ def __call__( self.image_encoder_clip_img_dim, prompt_embeds.dtype, generator, - clip_latents, ) + clip_latents, + ) # Prepare image VAE latents image_vae_latents = self.prepare_image_vae_latents( batch_size * num_images_per_prompt, @@ -859,7 +827,8 @@ def __call__( width, prompt_embeds.dtype, generator, - vae_latents, ) + vae_latents, + ) image_vae_latents, image_clip_latents = self._denoising_sample_fn( "t2i", image_vae_latents, @@ -871,7 +840,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) gen_image = self.decode_image_latents(image_vae_latents) # 8. Convert gen_image to PIL, gen_text has no else processing diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py index ac2ddb173413d..309b32b2d1129 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py @@ -13,8 +13,11 @@ # limitations under the License. 
# flake8: noqa -from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) try: if not (is_paddlenlp_available() and is_paddle_available()): @@ -22,14 +25,19 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import ( VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline) + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) else: from .modeling_text_unet import UNetFlatConditionModel from .pipeline_versatile_diffusion import VersatileDiffusionPipeline - from .pipeline_versatile_diffusion_dual_guided import \ - VersatileDiffusionDualGuidedPipeline - from .pipeline_versatile_diffusion_image_variation import \ - VersatileDiffusionImageVariationPipeline - from .pipeline_versatile_diffusion_text_to_image import \ - VersatileDiffusionTextToImagePipeline + from .pipeline_versatile_diffusion_dual_guided import ( + VersatileDiffusionDualGuidedPipeline, + ) + from .pipeline_versatile_diffusion_image_variation import ( + VersatileDiffusionImageVariationPipeline, + ) + from .pipeline_versatile_diffusion_text_to_image import ( + VersatileDiffusionTextToImagePipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 69099f5186cf6..377ab850f1e93 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -24,11 +24,13 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin from ...models.attention import Attention -from ...models.attention_processor import (AttentionProcessor, - AttnAddedKVProcessor, AttnProcessor) +from ...models.attention_processor import ( + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import (GaussianFourierProjection, TimestepEmbedding, - Timesteps) +from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput from ...utils import NEG_INF, deprecate, logging @@ -37,30 +39,29 @@ def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, # HF missing in v0.16.1 - resnet_out_scale_factor=1.0, # HF missing in v0.16.1 - cross_attention_norm=None, # HF missing in v0.16.1 - resnet_pre_temb_non_linearity: bool=False, ): - down_block_type = (down_block_type[7:] - if down_block_type.startswith("UNetRes") else - down_block_type) + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + 
downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, # HF missing in v0.16.1 + resnet_out_scale_factor=1.0, # HF missing in v0.16.1 + cross_attention_norm=None, # HF missing in v0.16.1 + resnet_pre_temb_non_linearity: bool = False, +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type if down_block_type == "DownBlockFlat": return DownBlockFlat( num_layers=num_layers, @@ -73,12 +74,11 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "CrossAttnDownBlockFlat": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnDownBlockFlat" - ) + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat") return CrossAttnDownBlockFlat( num_layers=num_layers, in_channels=in_channels, @@ -95,34 +95,35 @@ def get_down_block( use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{down_block_type} is not supported.") def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, # HF missing in v0.16.1 - resnet_out_scale_factor=1.0, # HF missing in v0.16.1 - cross_attention_norm=None, # HF missing in v0.16.1 - resnet_pre_temb_non_linearity: bool=False, ): - up_block_type = (up_block_type[7:] - if up_block_type.startswith("UNetRes") else up_block_type) + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, # HF missing in v0.16.1 + resnet_out_scale_factor=1.0, # HF missing in v0.16.1 + cross_attention_norm=None, # HF missing in v0.16.1 + resnet_pre_temb_non_linearity: bool = False, +): + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type if up_block_type == "UpBlockFlat": return UpBlockFlat( num_layers=num_layers, @@ -135,11 +136,11 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "CrossAttnUpBlockFlat": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnUpBlockFlat") + raise ValueError("cross_attention_dim must be specified for 
CrossAttnUpBlockFlat") return CrossAttnUpBlockFlat( num_layers=num_layers, in_channels=in_channels, @@ -156,7 +157,8 @@ def get_up_block( use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{up_block_type} is not supported.") @@ -236,54 +238,57 @@ class conditioning with `class_embed_type` equal to `None`. @register_to_config def __init__( - self, - sample_size: Optional[int]=None, - in_channels: int=4, - out_channels: int=4, - center_input_sample: bool=False, - flip_sin_to_cos: bool=True, - freq_shift: int=0, - down_block_types: Tuple[str]=( - "CrossAttnDownBlockFlat", - "CrossAttnDownBlockFlat", - "CrossAttnDownBlockFlat", - "DownBlockFlat", ), - mid_block_type: Optional[str]="UNetMidBlockFlatCrossAttn", - up_block_types: Tuple[str]=( - "UpBlockFlat", - "CrossAttnUpBlockFlat", - "CrossAttnUpBlockFlat", - "CrossAttnUpBlockFlat", ), - only_cross_attention: Union[bool, Tuple[bool]]=False, - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]]=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-5, - cross_attention_dim: Union[int, Tuple[int]]=1280, - encoder_hid_dim: Optional[int]=None, - attention_head_dim: Union[int, Tuple[int]]=8, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - class_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - upcast_attention: bool=False, - resnet_time_scale_shift: str="default", - resnet_skip_time_act: bool=False, - resnet_out_scale_factor: int=1.0, - time_embedding_type: str="positional", # fourier, positional - time_embedding_act_fn: Optional[str]=None, - timestep_post_act: Optional[str]=None, - time_cond_proj_dim: Optional[int]=None, - conv_in_kernel: int=3, - conv_out_kernel: int=3, - projection_class_embeddings_input_dim: Optional[int]=None, - class_embeddings_concat: bool=False, - mid_block_only_cross_attention: Optional[bool]=None, - cross_attention_norm: Optional[str]=None, - resnet_pre_temb_non_linearity: Optional[bool]=False, ): + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlockFlat", + "CrossAttnDownBlockFlat", + "CrossAttnDownBlockFlat", + "DownBlockFlat", + ), + mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlockFlat", + "CrossAttnUpBlockFlat", + "CrossAttnUpBlockFlat", + "CrossAttnUpBlockFlat", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + encoder_hid_dim: Optional[int] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", 
+ resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", # fourier, positional + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + resnet_pre_temb_non_linearity: Optional[bool] = False, + ): super().__init__() self.sample_size = sample_size @@ -292,7 +297,8 @@ def __init__( if len(down_block_types) != len(up_block_types): raise ValueError( "Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`:" - f" {down_block_types}. `up_block_types`: {up_block_types}.") + f" {down_block_types}. `up_block_types`: {up_block_types}." + ) if len(block_out_channels) != len(down_block_types): raise ValueError( @@ -300,35 +306,28 @@ def __init__( f" {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - only_cross_attention, - bool) and len(only_cross_attention) != len(down_block_types): + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( "Must provide the same number of `only_cross_attention` as `down_block_types`." f" `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( "Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`:" f" {attention_head_dim}. `down_block_types`: {down_block_types}." ) - if isinstance( - cross_attention_dim, - list) and len(cross_attention_dim) != len(down_block_types): + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): raise ValueError( "Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`:" f" {cross_attention_dim}. `down_block_types`: {down_block_types}." ) - if not isinstance( - layers_per_block, - int) and len(layers_per_block) != len(down_block_types): + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): raise ValueError( "Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`:" - f" {layers_per_block}. `down_block_types`: {down_block_types}.") + f" {layers_per_block}. `down_block_types`: {down_block_types}." + ) # input conv_in_padding = (conv_in_kernel - 1) // 2 @@ -336,26 +335,25 @@ def __init__( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time if time_embedding_type == "fourier": time_embed_dim = block_out_channels[0] * 2 if time_embed_dim % 2 != 0: - raise ValueError( - f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." 
- ) + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( time_embed_dim // 2, set_W_to_weight=False, log=False, - flip_sin_to_cos=flip_sin_to_cos, ) + flip_sin_to_cos=flip_sin_to_cos, + ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": time_embed_dim = block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] else: raise ValueError( @@ -367,20 +365,18 @@ def __init__( time_embed_dim, act_fn=act_fn, post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, ) + cond_proj_dim=time_cond_proj_dim, + ) if encoder_hid_dim is not None: - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, - cross_attention_dim) + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) else: self.encoder_hid_proj = None # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -395,15 +391,13 @@ def __init__( # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
- self.class_embedding = TimestepEmbedding( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) elif class_embed_type == "simple_projection": if projection_class_embeddings_input_dim is None: raise ValueError( "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" ) - self.class_embedding = nn.Linear( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None @@ -418,8 +412,7 @@ def __init__( elif time_embedding_act_fn == "gelu": self.time_embed_act = nn.GELU() else: - raise ValueError( - f"Unsupported activation function: {time_embedding_act_fn}") + raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") self.down_blocks = nn.LayerList([]) self.up_blocks = nn.LayerList([]) @@ -440,18 +433,16 @@ def __init__( if mid_block_only_cross_attention is None: mid_block_only_cross_attention = only_cross_attention - only_cross_attention = [only_cross_attention] * len( - down_block_types) + only_cross_attention = [only_cross_attention] * len(down_block_types) if mid_block_only_cross_attention is None: mid_block_only_cross_attention = False if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) if isinstance(cross_attention_dim, int): - cross_attention_dim = ( - cross_attention_dim, ) * len(down_block_types) + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) if isinstance(layers_per_block, int): layers_per_block = [layers_per_block] * len(down_block_types) @@ -492,7 +483,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) # mid @@ -510,7 +502,8 @@ def __init__( dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn": self.mid_block = UNetMidBlockFlatSimpleCrossAttn( in_channels=block_out_channels[-1], @@ -525,7 +518,8 @@ def __init__( skip_time_act=resnet_skip_time_act, only_cross_attention=mid_block_only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type is None: self.mid_block = None else: @@ -547,8 +541,7 @@ def __init__( prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] # add upsample block for all BUT final layer if not is_final_block: @@ -578,7 +571,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + 
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -587,7 +581,8 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) self.conv_act = nn.Silu() else: self.conv_norm_out = None @@ -598,16 +593,20 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, - padding=conv_out_padding, ) + padding=conv_out_padding, + ) @property def in_channels(self): deprecate( "in_channels", "1.0.0", - ("Accessing `in_channels` directly via unet.in_channels is deprecated. Please use" - " `unet.config.in_channels` instead"), - standard_warn=False, ) + ( + "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use" + " `unet.config.in_channels` instead" + ), + standard_warn=False, + ) return self.config.in_channels @property @@ -620,16 +619,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -638,9 +633,7 @@ def fn_recursive_add_processors( return processors - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -665,8 +658,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -714,8 +706,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -727,14 +718,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. 
# Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -747,24 +736,24 @@ def fn_recursive_set_attention_slice(module: nn.Layer, def _set_gradient_checkpointing(self, module, value=False): if isinstance( - module, - (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, - UpBlockFlat), ): + module, + (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, UpBlockFlat), + ): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - down_block_additional_residuals: Optional[Tuple[ - paddle.Tensor]]=None, - mid_block_additional_residual: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet2DConditionOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor @@ -795,8 +784,7 @@ def forward( upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info( - "Forward upsample size to force interpolation output size.") + logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True # prepare attention_mask @@ -816,7 +804,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors @@ -828,8 +820,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when num_class_embeds > 0") + raise ValueError("class_labels should be provided when num_class_embeds > 0") # maybe cast it to float16 class_labels = class_labels.cast(self.dtype) @@ -861,20 +852,15 @@ def forward( # 3. 
down - is_controlnet = (mid_block_additional_residual is not None and - down_block_additional_residuals is not None) - is_adapter = (mid_block_additional_residual is None and - down_block_additional_residuals is not None) + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: additional_kwargs = {} if is_adapter and len(down_block_additional_residuals) > 0: - additional_kwargs[ - "additional_residuals"] = down_block_additional_residuals.pop( - 0) + additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0) sample, res_samples = downsample_block( hidden_states=sample, @@ -882,10 +868,10 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, - **additional_kwargs, ) + **additional_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) if is_adapter and len(down_block_additional_residuals) > 0: sample += down_block_additional_residuals.pop(0) @@ -896,10 +882,10 @@ def forward( new_down_block_res_samples = () for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals): - down_block_res_sample = ( - down_block_res_sample + down_block_additional_residual) - new_down_block_res_samples += (down_block_res_sample, ) + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. 
mid @@ -909,7 +895,8 @@ def forward( emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if is_controlnet: sample = sample + mid_block_additional_residual @@ -918,17 +905,15 @@ def forward( for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] - if (hasattr(upsample_block, "has_cross_attention") and - upsample_block.has_cross_attention): + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, @@ -936,13 +921,15 @@ def forward( encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, ) + upsample_size=upsample_size, + ) # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) @@ -950,72 +937,60 @@ def forward( sample = self.conv_out(sample) if not return_dict: - return (sample, ) + return (sample,) return UNet2DConditionOutput(sample=sample) class LinearMultiDim(nn.Linear): - def __init__(self, - in_features, - out_features=None, - second_dim=4, - *args, - **kwargs): - in_features = ([in_features, second_dim, 1] - if isinstance(in_features, int) else list(in_features)) + def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs): + in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features) if out_features is None: out_features = in_features - out_features = ([out_features, second_dim, 1] if - isinstance(out_features, int) else list(out_features)) + out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features) self.in_features_multidim = in_features self.out_features_multidim = out_features self.n_dim = len(self.in_features_multidim) - super().__init__( - np.array(in_features).prod(), np.array(out_features).prod()) + super().__init__(np.array(in_features).prod(), np.array(out_features).prod()) self.in_features = self.weight.shape[0] def forward(self, input_tensor, *args, **kwargs): shape = input_tensor.shape - input_tensor = input_tensor.reshape( - [*shape[0:-self.n_dim], self.in_features]) + input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_features]) output_tensor = super().forward(input_tensor) - output_tensor = output_tensor.reshape( - [*shape[0:-self.n_dim], *self.out_features_multidim]) + output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_features_multidim]) return output_tensor class ResnetBlockFlat(nn.Layer): def __init__( - self, - *, - in_channels, - out_channels=None, - dropout: float=0.0, - temb_channels: int=512, - groups: int=32, - groups_out=None, - pre_norm: bool=True, - eps: float=1e-6, - 
time_embedding_norm: str="default", - use_in_shortcut=None, - second_dim: int=4, - pre_temb_non_linearity: bool=False, - **kwargs, ): + self, + *, + in_channels, + out_channels=None, + dropout: float = 0.0, + temb_channels: int = 512, + groups: int = 32, + groups_out=None, + pre_norm: bool = True, + eps: float = 1e-6, + time_embedding_norm: str = "default", + use_in_shortcut=None, + second_dim: int = 4, + pre_temb_non_linearity: bool = False, + **kwargs, + ): super().__init__() self.pre_temb_non_linearity = pre_temb_non_linearity self.pre_norm = pre_norm self.pre_norm = True - in_channels = ([in_channels, second_dim, 1] - if isinstance(in_channels, int) else list(in_channels)) + in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels) self.in_channels_prod = np.array(in_channels).prod() self.channels_multidim = in_channels if out_channels is not None: - out_channels = ([out_channels, second_dim, 1] - if isinstance(out_channels, int) else - list(out_channels)) + out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels) out_channels_prod = np.array(out_channels).prod() self.out_channels_multidim = out_channels else: @@ -1026,26 +1001,23 @@ def __init__( if groups_out is None: groups_out = groups - self.norm1 = nn.GroupNorm( - num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps) - self.conv1 = nn.Conv2D( - self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0) + self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps) + self.conv1 = nn.Conv2D(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0) if temb_channels is not None: self.time_emb_proj = nn.Linear(temb_channels, out_channels_prod) else: self.time_emb_proj = None - self.norm2 = nn.GroupNorm( - num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps) + self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps) self.dropout = nn.Dropout(dropout) - self.conv2 = nn.Conv2D( - out_channels_prod, out_channels_prod, kernel_size=1, padding=0) + self.conv2 = nn.Conv2D(out_channels_prod, out_channels_prod, kernel_size=1, padding=0) self.nonlinearity = nn.Silu() - self.use_in_shortcut = (self.in_channels_prod != out_channels_prod - if use_in_shortcut is None else use_in_shortcut) + self.use_in_shortcut = ( + self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut + ) self.conv_shortcut = None if self.use_in_shortcut: @@ -1054,14 +1026,14 @@ def __init__( out_channels_prod, kernel_size=1, stride=1, - padding=0, ) + padding=0, + ) self.n_dim = len(self.channels_multidim) def forward(self, input_tensor, temb=None): shape = input_tensor.shape - input_tensor = input_tensor.reshape( - [*shape[0:-self.n_dim], self.in_channels_prod, 1, 1]) + input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_channels_prod, 1, 1]) input_tensor = input_tensor.reshape([-1, self.in_channels_prod, 1, 1]) hidden_states = input_tensor @@ -1072,8 +1044,7 @@ def forward(self, input_tensor, temb=None): if temb is not None and self.time_emb_proj is not None: if not self.pre_temb_non_linearity: - temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, - None] + temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None] else: temb = self.time_emb_proj(temb)[:, :, None, None] hidden_states = hidden_states + temb @@ -1089,9 +1060,8 @@ def forward(self, input_tensor, temb=None): output_tensor = input_tensor + 
hidden_states - output_tensor = output_tensor.reshape([*shape[0:-self.n_dim], -1]) - output_tensor = output_tensor.reshape( - [*shape[0:-self.n_dim], *self.out_channels_multidim]) + output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], -1]) + output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_channels_multidim]) return output_tensor @@ -1099,21 +1069,22 @@ def forward(self, input_tensor, temb=None): # Copied from ppdiffusers.models.unet_2d_blocks.DownBlock2D with DownBlock2D->DownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim class DownBlockFlat(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1131,19 +1102,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + LinearMultiDim( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -1153,8 +1129,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1162,18 +1137,17 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states @@ -1181,27 +1155,28 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnDownBlock2D with CrossAttnDownBlock2D->CrossAttnDownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim class CrossAttnDownBlockFlat(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: 
float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - downsample_padding: int=1, - add_downsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1223,7 +1198,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -1235,7 +1212,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1244,32 +1223,38 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + LinearMultiDim( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, - additional_residuals=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + additional_residuals=None, + ): # TODO(Patrick, William) - attention mask is not used output_states = () @@ -1285,22 +1270,22 @@ def custom_forward(*inputs): return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, 
).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample - output_states += (hidden_states, ) + output_states += (hidden_states,) if additional_residuals is not None: hidden_states += additional_residuals @@ -1309,7 +1294,7 @@ def custom_forward(*inputs): for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states @@ -1317,27 +1302,27 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim class UpBlockFlat(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1352,31 +1337,25 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) if self.training and self.gradient_checkpointing: @@ -1386,8 +1365,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) @@ -1401,27 +1379,28 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim class CrossAttnUpBlockFlat(nn.Layer): def __init__( - self, 
- in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_upsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1430,8 +1409,7 @@ def __init__( self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1446,7 +1424,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -1458,7 +1438,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1467,36 +1449,35 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): # TODO(Patrick, William) - attention mask is not used for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = 
paddle.concat([hidden_states, res_hidden_states], axis=1) if self.training and self.gradient_checkpointing: @@ -1509,20 +1490,20 @@ def custom_forward(*inputs): return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -1534,29 +1515,29 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat class UNetMidBlockFlatCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet resnets = [ @@ -1571,7 +1552,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -1586,7 +1568,9 @@ def __init__( cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1595,7 +1579,9 @@ def __init__( in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) resnets.append( ResnetBlockFlat( in_channels=in_channels, @@ -1608,24 +1594,28 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = resnet(hidden_states, temb) return hidden_states @@ -1634,30 +1624,30 @@ def forward( # Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DSimpleCrossAttn with UNetMidBlock2DSimpleCrossAttn->UNetMidBlockFlatSimpleCrossAttn, ResnetBlock2D->ResnetBlockFlat class UNetMidBlockFlatSimpleCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.num_heads = in_channels // self.attn_num_head_channels @@ -1674,7 +1664,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -1696,7 +1687,9 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) resnets.append( ResnetBlockFlat( in_channels=in_channels, @@ -1710,20 +1703,22 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + temb=None, + 
encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): # attn @@ -1731,7 +1726,8 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) # resnet hidden_states = resnet(hidden_states, temb) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py index c09df819c2b79..43a40201892a1 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py @@ -18,21 +18,27 @@ import paddle import PIL.Image -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging from ..pipeline_utils import DiffusionPipeline from .modeling_text_unet import UNetFlatConditionModel -from .pipeline_versatile_diffusion_dual_guided import \ - VersatileDiffusionDualGuidedPipeline -from .pipeline_versatile_diffusion_image_variation import \ - VersatileDiffusionImageVariationPipeline -from .pipeline_versatile_diffusion_text_to_image import \ - VersatileDiffusionTextToImagePipeline +from .pipeline_versatile_diffusion_dual_guided import ( + VersatileDiffusionDualGuidedPipeline, +) +from .pipeline_versatile_diffusion_image_variation import ( + VersatileDiffusionImageVariationPipeline, +) +from .pipeline_versatile_diffusion_text_to_image import ( + VersatileDiffusionTextToImagePipeline, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -77,15 +83,16 @@ class VersatileDiffusionPipeline(DiffusionPipeline): scheduler: KarrasDiffusionSchedulers def __init__( - self, - tokenizer: CLIPTokenizer, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNet2DConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, ): + self, + tokenizer: CLIPTokenizer, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModelWithProjection, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( @@ -96,27 +103,28 @@ def __init__( image_unet=image_unet, text_unet=text_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) @paddle.no_grad() def image_variation( - self, - image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, 
List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -194,13 +202,8 @@ def image_variation( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - expected_components = inspect.signature( - VersatileDiffusionImageVariationPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} return VersatileDiffusionImageVariationPipeline(**components)( image=image, height=height, @@ -215,26 +218,27 @@ def image_variation( output_type=output_type, return_dict=return_dict, callback=callback, - callback_steps=callback_steps, ) + callback_steps=callback_steps, + ) @paddle.no_grad() def text_to_image( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -303,13 +307,8 @@ def text_to_image( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. 
""" - expected_components = inspect.signature( - VersatileDiffusionTextToImagePipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = VersatileDiffusionTextToImagePipeline(**components) output = temp_pipeline( prompt=prompt, @@ -325,7 +324,8 @@ def text_to_image( output_type=output_type, return_dict=return_dict, callback=callback, - callback_steps=callback_steps, ) + callback_steps=callback_steps, + ) # swap the attention blocks back to the original state temp_pipeline._swap_unet_attention_blocks() @@ -333,23 +333,23 @@ def text_to_image( @paddle.no_grad() def dual_guided( - self, - prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], - image: Union[str, List[str]], - text_to_image_strength: float=0.5, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], + image: Union[str, List[str]], + text_to_image_strength: float = 0.5, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -431,13 +431,8 @@ def dual_guided( returning a tuple, the first element is a list with the generated images. 
""" - expected_components = inspect.signature( - VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components) output = temp_pipeline( prompt=prompt, @@ -454,7 +449,8 @@ def dual_guided( output_type=output_type, return_dict=return_dict, callback=callback, - callback_steps=callback_steps, ) + callback_steps=callback_steps, + ) temp_pipeline._revert_dual_attention() return output diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index a47088e2f9411..faf4c4f7232ed 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -19,12 +19,19 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer, - CLIPVisionModelWithProjection) - -from ...models import (AutoencoderKL, DualTransformer2DModel, - Transformer2DModel, UNet2DConditionModel) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...models import ( + AutoencoderKL, + DualTransformer2DModel, + Transformer2DModel, + UNet2DConditionModel, +) from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -74,15 +81,16 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): _optional_components = ["text_unet"] def __init__( - self, - tokenizer: CLIPTokenizer, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNetFlatConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, ): + self, + tokenizer: CLIPTokenizer, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModelWithProjection, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNetFlatConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( tokenizer=tokenizer, @@ -92,12 +100,13 @@ def __init__( image_unet=image_unet, text_unet=text_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if self.text_unet is not None and ( - "dual_cross_attention" not in self.image_unet.config or - not self.image_unet.config.dual_cross_attention): + "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention + ): # if loading from a universal checkpoint rather than a saved dual-guided pipeline self._convert_to_dual_attention() @@ -114,10 +123,8 @@ def _convert_to_dual_attention(self): parent_name, 
index = name.rsplit(".", 1) index = int(index) - image_transformer = self.image_unet.get_sublayer(parent_name)[ - index] - text_transformer = self.text_unet.get_sublayer(parent_name)[ - index] + image_transformer = self.image_unet.get_sublayer(parent_name)[index] + text_transformer = self.text_unet.get_sublayer(parent_name)[index] config = image_transformer.config dual_transformer = DualTransformer2DModel( @@ -132,12 +139,12 @@ def _convert_to_dual_attention(self): sample_size=config.sample_size, num_vector_embeds=config.num_vector_embeds, activation_fn=config.activation_fn, - num_embeds_ada_norm=config.num_embeds_ada_norm, ) + num_embeds_ada_norm=config.num_embeds_ada_norm, + ) dual_transformer.transformers[0] = image_transformer dual_transformer.transformers[1] = text_transformer - self.image_unet.get_sublayer(parent_name)[ - index] = dual_transformer + self.image_unet.get_sublayer(parent_name)[index] = dual_transformer self.image_unet.register_to_config(dual_cross_attention=True) def _revert_dual_attention(self): @@ -149,12 +156,10 @@ def _revert_dual_attention(self): if isinstance(module, DualTransformer2DModel): parent_name, index = name.rsplit(".", 1) index = int(index) - self.image_unet.get_sublayer(parent_name)[ - index] = module.transformers[0] + self.image_unet.get_sublayer(parent_name)[index] = module.transformers[0] self.image_unet.register_to_config(dual_cross_attention=False) - def _encode_text_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_text_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): r""" Encodes the prompt into text encoder hidden states. @@ -168,11 +173,9 @@ def _encode_text_prompt(self, prompt, num_images_per_prompt, """ def normalize_embeddings(encoder_output): - embeds = paddle.matmul(encoder_output.last_hidden_state, - self.text_encoder.text_projection) + embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection) embeds_pooled = encoder_output.text_embeds - embeds = embeds / paddle.norm( - embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) + embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) return embeds batch_size = len(prompt) @@ -182,35 +185,35 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and 
self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = normalize_embeddings(prompt_embeds) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -221,37 +224,33 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + attention_mask=attention_mask, + ) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds - def _encode_image_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_image_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): r""" Encodes the prompt into vision encoder hidden states. 
@@ -265,8 +264,7 @@ def _encode_image_prompt(self, prompt, num_images_per_prompt, """ def normalize_embeddings(encoder_output): - embeds = self.image_encoder.vision_model.ln_post( - encoder_output.last_hidden_state) + embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state) embeds = paddle.matmul(embeds, self.image_encoder.vision_projection) embeds_pooled = embeds[:, 0:1] embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True) @@ -275,8 +273,7 @@ def normalize_embeddings(encoder_output): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings - image_input = self.image_feature_extractor( - images=prompt, return_tensors="pd") + image_input = self.image_feature_extractor(images=prompt, return_tensors="pd") pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype) image_embeddings = self.image_encoder(pixel_values) image_embeddings = normalize_embeddings(image_embeddings) @@ -284,32 +281,25 @@ def normalize_embeddings(encoder_output): # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size - uncond_images = self.image_feature_extractor( - images=uncond_images, return_tensors="pd") - pixel_values = uncond_images.pixel_values.cast( - self.image_encoder.dtype) + uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd") + pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype) negative_prompt_embeds = self.image_encoder(pixel_values) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and conditional embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings @@ -329,60 +319,51 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs(self, prompt, image, height, width, callback_steps): - if (not isinstance(prompt, str) and - not isinstance(prompt, PIL.Image.Image) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}" - ) - if (not isinstance(image, str) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): - raise ValueError( - f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}" - ) + if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}") + if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list): + raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -396,44 +377,39 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - def set_transformer_params(self, - mix_ratio: float=0.5, - condition_types: Tuple=("text", "image")): + def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")): for name, module in self.image_unet.named_sublayers(include_self=True): if isinstance(module, DualTransformer2DModel): module.mix_ratio = mix_ratio for i, type in enumerate(condition_types): if type == "text": - module.condition_lengths[ - i] = self.text_encoder.config.max_position_embeddings - module.transformer_index_for_condition[ - i] = 1 # use the second (text) transformer + module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings + module.transformer_index_for_condition[i] = 1 # use the second (text) transformer else: module.condition_lengths[i] = 257 - module.transformer_index_for_condition[ - i] = 0 # use the first (image) transformer + module.transformer_index_for_condition[i] = 0 # use the first (image) transformer @paddle.no_grad() def __call__( - self, - prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], - image: Union[str, List[str]], - text_to_image_strength: float=0.5, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], + image: Union[str, List[str]], + text_to_image_strength: float = 0.5, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -532,12 +508,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompts - prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt, - do_classifier_free_guidance) - image_embeddings = self._encode_image_prompt( - image, num_images_per_prompt, do_classifier_free_guidance) - dual_prompt_embeddings = paddle.concat( - [prompt_embeds, image_embeddings], axis=1) + prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) + image_embeddings = self._encode_image_prompt(image, num_images_per_prompt, do_classifier_free_guidance) + dual_prompt_embeddings = paddle.concat([prompt_embeds, image_embeddings], axis=1) prompt_types = ("text", "image") # 4. Prepare timesteps @@ -553,7 +526,8 @@ def __call__( width, dual_prompt_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -564,26 +538,19 @@ def __call__( # 8. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.image_unet( - latent_model_input, - t, - encoder_hidden_states=dual_prompt_embeddings).sample + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -597,6 +564,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 668f748dfa42a..fc9d645fc7991 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -19,8 +19,7 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -57,27 +56,30 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): scheduler: KarrasDiffusionSchedulers def __init__( - self, - image_feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - vae: AutoencoderKL, - 
scheduler: KarrasDiffusionSchedulers, ): + self, + image_feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( image_feature_extractor=image_feature_extractor, image_encoder=image_encoder, image_unet=image_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) def _encode_image_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + ): r""" Encodes the prompt into text encoder hidden states. @@ -94,8 +96,7 @@ def _encode_image_prompt( """ def normalize_embeddings(encoder_output): - embeds = self.image_encoder.vision_model.ln_post( - encoder_output.last_hidden_state) + embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state) embeds = paddle.matmul(embeds, self.image_encoder.vision_projection) embeds_pooled = embeds[:, 0:1] embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True) @@ -107,8 +108,7 @@ def normalize_embeddings(encoder_output): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings - image_input = self.image_feature_extractor( - images=prompt, return_tensors="pd") + image_input = self.image_feature_extractor(images=prompt, return_tensors="pd") pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype) image_embeddings = self.image_encoder(pixel_values) image_embeddings = normalize_embeddings(image_embeddings) @@ -116,8 +116,7 @@ def normalize_embeddings(encoder_output): # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -127,37 +126,33 @@ def normalize_embeddings(encoder_output): elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, PIL.Image.Image): uncond_images = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_images = negative_prompt - uncond_images = self.image_feature_extractor( - images=uncond_images, return_tensors="pd") - pixel_values = uncond_images.pixel_values.cast( - self.image_encoder.dtype) + uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd") + pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype) negative_prompt_embeds = self.image_encoder(pixel_values) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and conditional embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings @@ -177,50 +172,51 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -242,23 +238,23 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -352,8 +348,8 @@ def __call__( # 3. Encode input prompt image_embeddings = self._encode_image_prompt( - image, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt) + image, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -368,7 +364,8 @@ def __call__( width, image_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -376,25 +373,19 @@ def __call__( # 7. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.image_unet( - latent_model_input, t, - encoder_hidden_states=image_embeddings).sample + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -408,6 +399,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index 1524df9f993ed..0d4999c94b24c 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -17,8 +17,11 @@ from typing import Callable, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, +) from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -67,13 +70,14 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): _optional_components = ["text_unet"] def __init__( - self, - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNetFlatConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, ): + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNetFlatConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( tokenizer=tokenizer, @@ -81,8 +85,9 @@ def __init__( image_unet=image_unet, text_unet=text_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if self.text_unet is not None: self._swap_unet_attention_blocks() @@ -97,19 +102,22 @@ def _swap_unet_attention_blocks(self): index = int(index) ( self.image_unet.get_sublayer(parent_name)[index], - 
self.text_unet.get_sublayer(parent_name)[index], ) = ( - self.text_unet.get_sublayer(parent_name)[index], - self.image_unet.get_sublayer(parent_name)[index], ) + self.text_unet.get_sublayer(parent_name)[index], + ) = ( + self.text_unet.get_sublayer(parent_name)[index], + self.image_unet.get_sublayer(parent_name)[index], + ) def remove_unused_weights(self): self.register_modules(text_unet=None) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + ): r""" Encodes the prompt into text encoder hidden states. @@ -126,11 +134,9 @@ def _encode_prompt( """ def normalize_embeddings(encoder_output): - embeds = paddle.matmul(encoder_output.last_hidden_state, - self.text_encoder.text_projection) + embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection) embeds_pooled = encoder_output.text_embeds - embeds = embeds / paddle.norm( - embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) + embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) return embeds batch_size = len(prompt) if isinstance(prompt, list) else 1 @@ -140,35 +146,35 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = normalize_embeddings(prompt_embeds) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -178,14 +184,16 @@ def normalize_embeddings(encoder_output): elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" 
{type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -195,32 +203,29 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + attention_mask=attention_mask, + ) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -240,54 +245,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -300,18 +301,20 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -333,23 +336,23 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -434,9 +437,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt - prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt) + prompt_embeds = self._encode_prompt( + prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -451,7 +454,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -459,25 +463,19 @@ def __call__( # 7. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.image_unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -491,6 +489,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py index 4a1b00a7eb0fa..f7426c40427c0 100644 --- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py @@ -17,5 +17,7 @@ from ...utils import is_paddle_available, is_paddlenlp_available if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_vq_diffusion import (LearnedClassifierFreeSamplingEmbeddings, - VQDiffusionPipeline) + from .pipeline_vq_diffusion import ( + LearnedClassifierFreeSamplingEmbeddings, + VQDiffusionPipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py index f8d1fc09518db..e97be223237f9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py @@ -42,23 +42,23 @@ class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - learnable: bool, - hidden_size: Optional[int]=None, - length: Optional[int]=None, ): + self, + learnable: bool, + hidden_size: Optional[int] = None, + length: Optional[int] = None, + ): super().__init__() self.learnable = learnable if self.learnable: - assert (hidden_size is not None - ), "learnable=True requires `hidden_size` to be set" + 
assert hidden_size is not None, "learnable=True requires `hidden_size` to be set" assert length is not None, "learnable=True requires `length` to be set" embeddings = paddle.zeros([length, hidden_size]) self.embeddings = self.create_parameter( - embeddings.shape, - default_initializer=nn.initializer.Assign(embeddings)) + embeddings.shape, default_initializer=nn.initializer.Assign(embeddings) + ) else: self.embeddings = None @@ -95,13 +95,13 @@ class VQDiffusionPipeline(DiffusionPipeline): scheduler: VQDiffusionScheduler def __init__( - self, - vqvae: VQModel, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - transformer: Transformer2DModel, - scheduler: VQDiffusionScheduler, - learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings, + self, + vqvae: VQModel, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + transformer: Transformer2DModel, + scheduler: VQDiffusionScheduler, + learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings, ): super().__init__() @@ -114,8 +114,7 @@ def __init__( learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, ) - def _encode_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -123,16 +122,17 @@ def _encode_prompt(self, prompt, num_images_per_prompt, prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] prompt_embeds = self.text_encoder(text_input_ids)[0] # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion. @@ -141,21 +141,17 @@ def _encode_prompt(self, prompt, num_images_per_prompt, # # CLIP normalizing the pooled output. 
# https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053 - prompt_embeds = prompt_embeds / prompt_embeds.norm( - axis=-1, keepdim=True) + prompt_embeds = prompt_embeds / prompt_embeds.norm(axis=-1, keepdim=True) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: if self.learned_classifier_free_sampling_embeddings.learnable: - negative_prompt_embeds = ( - self.learned_classifier_free_sampling_embeddings.embeddings) - negative_prompt_embeds = negative_prompt_embeds.unsqueeze( - 0).tile([batch_size, 1, 1]) + negative_prompt_embeds = self.learned_classifier_free_sampling_embeddings.embeddings + negative_prompt_embeds = negative_prompt_embeds.unsqueeze(0).tile([batch_size, 1, 1]) else: uncond_tokens = [""] * batch_size @@ -165,45 +161,39 @@ def _encode_prompt(self, prompt, num_images_per_prompt, padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids)[0] + return_tensors="pd", + ) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids)[0] # See comment for normalizing text embeddings - negative_prompt_embeds = (negative_prompt_embeds / - negative_prompt_embeds.norm( - axis=-1, keepdim=True)) + negative_prompt_embeds = negative_prompt_embeds / negative_prompt_embeds.norm(axis=-1, keepdim=True) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - num_inference_steps: int=100, - guidance_scale: float=5.0, - truncation_rate: float=1.0, - num_images_per_prompt: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ) -> Union[ImagePipelineOutput, - Tuple]: + self, + prompt: Union[str, List[str]], + num_inference_steps: int = 100, + guidance_scale: float = 5.0, + truncation_rate: float = 1.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ) -> Union[ImagePipelineOutput, Tuple]: """ Function invoked when calling the pipeline for generation. @@ -252,23 +242,21 @@ def __call__( elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") batch_size = batch_size * num_images_per_prompt do_classifier_free_guidance = guidance_scale > 1.0 - prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, - do_classifier_free_guidance) + prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) # get the initial completely masked latents unless the user supplied it @@ -278,14 +266,12 @@ def __call__( latents = paddle.full(latents_shape, mask_class, dtype="int64") else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) - if (latents < 0).any() or ( - latents >= self.transformer.num_vector_embeds).any(): + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any(): raise ValueError( "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0," - f" {self.transformer.num_vector_embeds - 1} (inclusive).") + f" {self.transformer.num_vector_embeds - 1} (inclusive)." 
+ ) # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -296,20 +282,15 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the sample if we are doing classifier free guidance - latent_model_input = (paddle.concat([sample] * 2) - if do_classifier_free_guidance else sample) + latent_model_input = paddle.concat([sample] * 2) if do_classifier_free_guidance else sample # predict the un-noised image # model_output == `log_p_x_0` - model_output = self.transformer( - latent_model_input, - encoder_hidden_states=prompt_embeds, - timestep=t).sample + model_output = self.transformer(latent_model_input, encoder_hidden_states=prompt_embeds, timestep=t).sample if do_classifier_free_guidance: model_output_uncond, model_output_text = model_output.chunk(2) - model_output = model_output_uncond + guidance_scale * ( - model_output_text - model_output_uncond) + model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond) model_output -= logsumexp(model_output, axis=1, keepdim=True) model_output = self.truncate(model_output, truncation_rate) @@ -318,9 +299,7 @@ def __call__( model_output = model_output.clip(-70) # compute the previous noisy sample x_t -> x_t-1 - sample = self.scheduler.step( - model_output, timestep=t, sample=sample, - generator=generator).prev_sample + sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -331,9 +310,9 @@ def __call__( batch_size, self.transformer.height, self.transformer.width, - embedding_channels, ) - embeddings = self.vqvae.quantize.get_codebook_entry( - sample, shape=embeddings_shape) + embedding_channels, + ) + embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape) image = self.vqvae.decode(embeddings, force_not_quantize=True).sample image = (image / 2 + 0.5).clip(0, 1) @@ -343,34 +322,29 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) - def truncate(self, log_p_x_0: paddle.Tensor, - truncation_rate: float) -> paddle.Tensor: + def truncate(self, log_p_x_0: paddle.Tensor, truncation_rate: float) -> paddle.Tensor: """ Truncates log_p_x_0 such that for each column vector, the total cumulative probability is `truncation_rate` The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to zero. 
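A toy illustration of that rule, with invented probabilities and a simplified 1-D shape (the real implementation below operates on a batch of column vectors in log space): a class is kept while the cumulative probability of the classes ranked above it is still below truncation_rate, and the top class is always kept.

    import paddle

    p = paddle.to_tensor([0.5, 0.3, 0.15, 0.05])  # toy categorical distribution
    truncation_rate = 0.9

    sorted_p, indices = paddle.topk(p, k=4)
    keep = (sorted_p.cumsum(0) < truncation_rate).cast("int64")           # [1, 1, 0, 0]
    keep = paddle.concat([paddle.ones([1], dtype="int64"), keep[:-1]])    # shift: always keep the largest
    # keep is now [1, 1, 1, 0]; the 0.05 class would be set to -inf in log space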
""" - sorted_log_p_x_0, indices = paddle.topk( - log_p_x_0, k=log_p_x_0.shape[1], axis=1) + sorted_log_p_x_0, indices = paddle.topk(log_p_x_0, k=log_p_x_0.shape[1], axis=1) sorted_p_x_0 = paddle.exp(sorted_log_p_x_0) - keep_mask = ( - sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64") + keep_mask = (sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64") # Ensure that at least the largest probability is not zeroed out all_true = paddle.full_like(keep_mask[:, 0:1, :], 1) keep_mask = paddle.concat((all_true, keep_mask), axis=1) keep_mask = keep_mask[:, :-1, :] - keep_mask = paddle.take_along_axis( - keep_mask, indices.argsort(1), - axis=1).cast("bool") # keep_mask.gather(indices.argsort(1), axis=1) + keep_mask = paddle.take_along_axis(keep_mask, indices.argsort(1), axis=1).cast( + "bool" + ) # keep_mask.gather(indices.argsort(1), axis=1) rv = log_p_x_0.clone() # rv[~keep_mask] = -INF # -inf = log(0) - rv = paddle.where( - keep_mask, rv, paddle.to_tensor( - -INF, dtype="float32")) + rv = paddle.where(keep_mask, rv, paddle.to_tensor(-INF, dtype="float32")) return rv diff --git a/ppdiffusers/ppdiffusers/schedulers/__init__.py b/ppdiffusers/ppdiffusers/schedulers/__init__.py index dd064c0187497..682e58fcc57df 100644 --- a/ppdiffusers/ppdiffusers/schedulers/__init__.py +++ b/ppdiffusers/ppdiffusers/schedulers/__init__.py @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_scipy_available) +from ..utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_scipy_available, +) try: if not is_paddle_available(): @@ -22,8 +25,9 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_objects import * # noqa F403 else: - from .preconfig.preconfig_scheduling_euler_ancestral_discrete import \ - PreconfigEulerAncestralDiscreteScheduler + from .preconfig.preconfig_scheduling_euler_ancestral_discrete import ( + PreconfigEulerAncestralDiscreteScheduler, + ) from .scheduling_ddim import DDIMScheduler from .scheduling_ddim_inverse import DDIMInverseScheduler from .scheduling_ddpm import DDPMScheduler @@ -31,13 +35,11 @@ from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler from .scheduling_dpmsolver_unidiffuser import DPMSolverUniDiffuserScheduler - from .scheduling_euler_ancestral_discrete import \ - EulerAncestralDiscreteScheduler + from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler from .scheduling_euler_discrete import EulerDiscreteScheduler from .scheduling_heun_discrete import HeunDiscreteScheduler from .scheduling_ipndm import IPNDMScheduler - from .scheduling_k_dpm_2_ancestral_discrete import \ - KDPM2AncestralDiscreteScheduler + from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler from .scheduling_karras_ve import KarrasVeScheduler from .scheduling_pndm import PNDMScheduler @@ -55,6 +57,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_scipy_objects import * # noqa F403 else: - from .preconfig.preconfig_scheduling_lms_discrete import \ - PreconfigLMSDiscreteScheduler + from .preconfig.preconfig_scheduling_lms_discrete import ( + PreconfigLMSDiscreteScheduler, + ) from .scheduling_lms_discrete import LMSDiscreteScheduler diff --git 
a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py index 0af0ad582bd99..ecff93753b32d 100644 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py +++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py @@ -14,8 +14,11 @@ # limitations under the License. # flake8: noqa -from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_scipy_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_scipy_available, +) try: if not is_paddle_available(): @@ -23,13 +26,13 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_objects import * # noqa F403 else: - from .preconfig_scheduling_euler_ancestral_discrete import \ - PreconfigEulerAncestralDiscreteScheduler + from .preconfig_scheduling_euler_ancestral_discrete import ( + PreconfigEulerAncestralDiscreteScheduler, + ) try: if not (is_paddle_available() and is_scipy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_scipy_objects import * # noqa F403 else: - from .preconfig_scheduling_lms_discrete import \ - PreconfigLMSDiscreteScheduler + from .preconfig_scheduling_lms_discrete import PreconfigLMSDiscreteScheduler diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py index 53de9a57c4178..a925526d76b33 100644 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py @@ -47,8 +47,7 @@ class PreconfigEulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
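A minimal pure-Python sketch of how such a schedule is typically built from alpha_bar, using the max_beta=0.999 default shown in the signature above (a hedged illustration, not code from this hunk):

    import math

    def alpha_bar(t):
        # cumulative product of (1 - beta) as a function of normalized time t in [0, 1]
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    num_diffusion_timesteps, max_beta = 1000, 0.999
    betas = [
        min(1 - alpha_bar((i + 1) / num_diffusion_timesteps) / alpha_bar(i / num_diffusion_timesteps), max_beta)
        for i in range(num_diffusion_timesteps)
    ]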
@@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -108,38 +107,40 @@ class PreconfigEulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - preconfig: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + preconfig: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -148,18 +149,15 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.is_scale_input_called = False self.preconfig = preconfig self.step_index_offset = 0 - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - **kwargs) -> paddle.Tensor: + def scale_model_input( + self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs + ) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. 
@@ -178,7 +176,7 @@ def scale_model_input(self, if not self.preconfig: sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample else: if step_index > (len(self.latent_scales) - 1): @@ -196,13 +194,8 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = num_inference_steps self.step_index_offset = 0 - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -213,23 +206,21 @@ def set_timesteps(self, num_inference_steps: int): for step_index_i in range(len(self.timesteps)): sigma_from = self.sigmas[step_index_i] sigma_to = self.sigmas[step_index_i + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / - sigma_from**2)**0.5 - sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 self.sigma_up.append(sigma_up) self.sigma_down.append(sigma_down) - self.latent_scales = 1 / ((self.sigmas**2 + 1)**0.5) + self.latent_scales = 1 / ((self.sigmas**2 + 1) ** 0.5) def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, - **kwargs, ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + **kwargs, + ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -251,7 +242,8 @@ def step( if not self.is_scale_input_called: logger.warning( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." 
+ ) if kwargs.get("return_pred_original_sample") is not None: return_pred_original_sample = kwargs["return_pred_original_sample"] else: @@ -270,11 +262,9 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / ( - sigma**2 + 1)**0.5) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -283,38 +273,37 @@ def step( if not self.preconfig: sigma_from = self.sigmas[step_index] sigma_to = self.sigmas[step_index + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from - **2)**0.5 - sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 else: sigma_up = self.sigma_up[step_index] sigma_down = self.sigma_down[step_index] # 2. Convert to an ODE derivative dt = sigma_down - sigma prev_sample = sample + derivative * dt - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) prev_sample = prev_sample + noise * sigma_up if not return_dict: if not return_pred_original_sample: - return (prev_sample, ) + return (prev_sample,) else: return (prev_sample, pred_original_sample) return PreconfigEulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + prev_sample=prev_sample, pred_original_sample=pred_original_sample + ) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples self.sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = self.sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py index dd6c73e2e7250..16f74fcb6860f 100644 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py @@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -106,38 +106,40 @@ class PreconfigLMSDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: 
float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - preconfig=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + preconfig=True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -146,18 +148,15 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.derivatives = [] self.is_scale_input_called = False self.preconfig = preconfig - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - **kwargs) -> paddle.Tensor: + def scale_model_input( + self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs + ) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. 
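The ancestral update in the Euler scheduler's `step` above splits each move from `sigma_from` to `sigma_to` into a deterministic part (`sigma_down`) and re-injected noise (`sigma_up`), which is exactly what its `set_timesteps` precomputes when `preconfig` is enabled. A standalone numpy sketch of that update for the `epsilon` prediction type (names are illustrative):

import numpy as np

def euler_ancestral_step(sample, model_output, sigma_from, sigma_to, rng=None):
    if rng is None:
        rng = np.random.default_rng()

    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5

    pred_original_sample = sample - sigma_from * model_output          # "predicted x_0"
    derivative = (sample - pred_original_sample) / sigma_from          # ODE derivative (equals model_output here)

    prev_sample = sample + derivative * (sigma_down - sigma_from)      # deterministic step down to sigma_down
    return prev_sample + rng.standard_normal(sample.shape) * sigma_up  # ancestral noise back up to sigma_to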
@@ -175,7 +174,7 @@ def scale_model_input(self, self.is_scale_input_called = True if not self.preconfig: sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample else: return sample * self.latent_scales[step_index] @@ -195,16 +194,14 @@ def lms_derivative(tau): for k in range(order): if current_order == k: continue - prod *= (tau - self.sigmas[t - k]) / ( - self.sigmas[t - current_order] - self.sigmas[t - k]) + prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) return prod - integrated_coeff = integrate.quad( - lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] return integrated_coeff - def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4): + def set_timesteps(self, num_inference_steps: int, preconfig_order: int = 4): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -214,13 +211,8 @@ def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -230,24 +222,22 @@ def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4): if self.preconfig: self.order = preconfig_order self.lms_coeffs = [] - self.latent_scales = [ - 1.0 / ((sigma**2 + 1)**0.5) for sigma in self.sigmas - ] + self.latent_scales = [1.0 / ((sigma**2 + 1) ** 0.5) for sigma in self.sigmas] for step_index in range(self.num_inference_steps): order = min(step_index + 1, preconfig_order) - self.lms_coeffs.append([ - self.get_lms_coefficient(order, step_index, curr_order) - for curr_order in range(order) - ]) + self.lms_coeffs.append( + [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] + ) def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - order: int=4, - return_dict: bool=True, - **kwargs, ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + order: int = 4, + return_dict: bool = True, + **kwargs, + ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -272,7 +262,8 @@ def step( if not self.is_scale_input_called: warnings.warn( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." 
+ ) if kwargs.get("return_pred_original_sample") is not None: return_pred_original_sample = kwargs["return_pred_original_sample"] else: @@ -292,8 +283,7 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / ( - sigma**2 + 1)**0.5) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": pred_original_sample = model_output else: @@ -310,42 +300,37 @@ def step( if not self.preconfig: # 3. If not preconfiged, compute linear multistep coefficients. order = min(step_index + 1, order) - lms_coeffs = [ - self.get_lms_coefficient(order, step_index, curr_order) - for curr_order in range(order) - ] + lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] # 4. Compute previous sample based on the derivatives path prev_sample = sample + sum( - coeff * derivative - for coeff, derivative in zip(lms_coeffs, - reversed(self.derivatives))) + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) + ) else: # 3. If preconfiged, direct compute previous sample based on the derivatives path prev_sample = sample + sum( coeff * derivative - for coeff, derivative in zip(self.lms_coeffs[step_index], - reversed(self.derivatives))) + for coeff, derivative in zip(self.lms_coeffs[step_index], reversed(self.derivatives)) + ) if not return_dict: if not return_pred_original_sample: - return (prev_sample, ) + return (prev_sample,) else: return (prev_sample, pred_original_sample) - return PreconfigLMSDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return PreconfigLMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py index b4929d761f687..9bb46c472ca10 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py @@ -48,8 +48,7 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -68,7 +67,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -131,38 +130,41 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - clip_sample: bool=True, - set_alpha_to_one: bool=True, - steps_offset: int=0, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - clip_sample_range: float=1.0, - sample_max_value: float=1.0, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype="float32") elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype="float32") + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32") elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype="float32", )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype="float32", + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -171,20 +173,16 @@ def __init__( # For the final step, there is no previous alphas_cumprod because we are already at 0 # `set_alpha_to_one` decides whether we set this parameter simply to one or # whether we use the final alpha of the "non-previous" one. - self.final_alpha_cumprod = (paddle.to_tensor(1.0) if set_alpha_to_one - else self.alphas_cumprod[0]) + self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
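The coefficients summed in the LMS `step` above come from integrating Lagrange basis polynomials over each sigma interval; that is what `get_lms_coefficient` evaluates with `scipy.integrate.quad` and what `set_timesteps` caches in `self.lms_coeffs` when `preconfig` is on. A compact standalone sketch (illustrative names only):

from scipy import integrate

def lms_coefficient(sigmas, order, t, current_order):
    # integral of the Lagrange basis polynomial for `current_order` over [sigmas[t], sigmas[t + 1]]
    def lms_derivative(tau):
        prod = 1.0
        for k in range(order):
            if current_order == k:
                continue
            prod *= (tau - sigmas[t - k]) / (sigmas[t - current_order] - sigmas[t - k])
        return prod

    return integrate.quad(lms_derivative, sigmas[t], sigmas[t + 1], epsrel=1e-4)[0]

def lms_step(sample, derivatives, coeffs):
    # derivatives holds the recent per-step derivatives, newest last, like self.derivatives
    return sample + sum(c * d for c, d in zip(coeffs, reversed(derivatives)))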
@@ -200,13 +198,11 @@ def scale_model_input(self, def _get_variance(self, timestep, prev_timestep): alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev - variance = (beta_prod_t_prev / beta_prod_t) * ( - 1 - alpha_prod_t / alpha_prod_t_prev) + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) return variance @@ -232,8 +228,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -242,11 +237,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -266,27 +258,28 @@ def set_timesteps(self, num_inference_steps: int): raise ValueError( f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps.") + f" maximal {self.config.num_train_timesteps} timesteps." + ) self.num_inference_steps = num_inference_steps step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = ((np.arange(0, num_inference_steps) * step_ratio) - .round()[::-1].copy().astype(np.int64)) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) self.timesteps += self.config.steps_offset def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - eta: float=0.0, - use_clipped_model_output: bool=False, - generator=None, - variance_noise: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[DDIMSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + generator=None, + variance_noise: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -330,118 +323,104 @@ def step( # - pred_prev_sample -> "x_t-1" # 1. 
get previous step value (=t-1) - prev_timestep = (timestep - self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps # 2. compute alphas, betas alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t** - (0.5) * pred_original_sample) / beta_prod_t**(0.5) + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`") + " `v_prediction`" + ) # 4. Clip or threshold "predicted x_0" if self.config.thresholding: pred_original_sample = self._threshold_sample(pred_original_sample) elif self.config.clip_sample: pred_original_sample = pred_original_sample.clip( - -self.config.clip_sample_range, self.config.clip_sample_range) + -self.config.clip_sample_range, self.config.clip_sample_range + ) # 5. compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) variance = self._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) if use_clipped_model_output: # the pred_epsilon is always re-derived from the clipped x_0 in Glide - pred_epsilon = (sample - alpha_prod_t** - (0.5) * pred_original_sample) / beta_prod_t**(0.5) + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( - 0.5) * pred_epsilon + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction) + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction if eta > 0: if variance_noise is not None and generator is not None: raise ValueError( "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" - " `variance_noise` stays `None`.") + " `variance_noise` stays `None`." 
+ ) if variance_noise is None: - variance_noise = randn_tensor( - model_output.shape, - generator=generator, - dtype=model_output.dtype) + variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) variance = std_dev_t * variance_noise prev_sample = prev_sample + variance if not return_dict: - return (prev_sample, ) + return (prev_sample,) - return DDIMSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(dtype=original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as sample alphas_cumprod = self.alphas_cumprod.cast(dtype=sample.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py index a64c94d782e46..8dfd896087d08 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py @@ -47,8 +47,7 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time 
from t = [0,1]. @@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -119,45 +118,46 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - clip_sample: bool=True, - set_alpha_to_zero: bool=True, - steps_offset: int=0, - prediction_type: str="epsilon", - clip_sample_range: float=1.0, - **kwargs, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_zero: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + **kwargs, + ): if kwargs.get("set_alpha_to_one", None) is not None: - deprecation_message = "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead." - deprecate( - "set_alpha_to_one", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecation_message = ( + "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead." + ) + deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False) set_alpha_to_zero = kwargs["set_alpha_to_one"] if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype="float32") elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype="float32") + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32") elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype="float32", )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype="float32", + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -167,20 +167,16 @@ def __init__( # `set_alpha_to_zero` decides whether we set this parameter simply to zero # in this case, self.step() just output the predicted noise # or whether we use the final alpha of the "non-previous" one. 
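The DDIM `step` above is Eq. (12) of https://arxiv.org/pdf/2010.02502.pdf, with the stochastic term from Eq. (16) scaled by `eta`. A minimal numpy sketch of the `epsilon` branch (illustrative names; thresholding and clipping omitted):

import numpy as np

def ddim_step(sample, model_output, alpha_prod_t, alpha_prod_t_prev, eta=0.0, noise=None):
    beta_prod_t = 1 - alpha_prod_t

    # "predicted x_0" under the epsilon parameterization
    pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
    pred_epsilon = model_output

    # sigma_t(eta) from Eq. (16)
    variance = ((1 - alpha_prod_t_prev) / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
    std_dev_t = eta * variance**0.5

    # deterministic part of Eq. (12): "direction pointing to x_t"
    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * pred_epsilon
    prev_sample = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction

    if eta > 0:
        if noise is None:
            noise = np.random.standard_normal(sample.shape)
        prev_sample = prev_sample + std_dev_t * noise
    return prev_sample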
- self.final_alpha_cumprod = (paddle.to_tensor(0.0) if set_alpha_to_zero - else self.alphas_cumprod[-1]) + self.final_alpha_cumprod = paddle.to_tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps).copy().astype(np.int64)) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps).copy().astype(np.int64)) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -207,75 +203,73 @@ def set_timesteps(self, num_inference_steps: int): raise ValueError( f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps.") + f" maximal {self.config.num_train_timesteps} timesteps." + ) self.num_inference_steps = num_inference_steps step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = ((np.arange(0, num_inference_steps) * step_ratio).round() - .copy().astype(np.int64)) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) self.timesteps += self.config.steps_offset def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - eta: float=0.0, - use_clipped_model_output: bool=False, - variance_noise: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[DDIMSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + variance_noise: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: # 1. get previous step value (=t+1) - prev_timestep = (timestep + self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps # 2. compute alphas, betas # change original implementation to exactly match noise levels for analogous forward process alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] - if prev_timestep < self.config.num_train_timesteps - else self.final_alpha_cumprod) + alpha_prod_t_prev = ( + self.alphas_cumprod[prev_timestep] + if prev_timestep < self.config.num_train_timesteps + else self.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t # 3. 
compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t** - (0.5) * pred_original_sample) / beta_prod_t**(0.5) + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`") + " `v_prediction`" + ) # 4. Clip or threshold "predicted x_0" if self.config.clip_sample: pred_original_sample = pred_original_sample.clip( - -self.config.clip_sample_range, self.config.clip_sample_range) + -self.config.clip_sample_range, self.config.clip_sample_range + ) # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev)**(0.5) * pred_epsilon + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction) + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction if not return_dict: return (prev_sample, pred_original_sample) - return DDIMSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def __len__(self): return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py index a3917f57615f8..167ae05b5b169 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py @@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -123,31 +123,35 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - variance_type: str="fixed_small", - clip_sample: bool=True, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - clip_sample_range: float=1.0, - sample_max_value: float=1.0, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + 
beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + variance_type: str = "fixed_small", + clip_sample: bool = True, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) @@ -156,8 +160,7 @@ def __init__( betas = paddle.linspace(-6, 6, num_train_timesteps) self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -169,14 +172,11 @@ def __init__( # setable values self.custom_timesteps = False self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) self.variance_type = variance_type - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -191,9 +191,10 @@ def scale_model_input(self, return sample def set_timesteps( - self, - num_inference_steps: Optional[int]=None, - timesteps: Optional[List[int]]=None, ): + self, + num_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -207,20 +208,18 @@ def set_timesteps( must be `None`. """ if num_inference_steps is not None and timesteps is not None: - raise ValueError( - "Can only pass one of `num_inference_steps` or `custom_timesteps`." - ) + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") if timesteps is not None: for i in range(1, len(timesteps)): if timesteps[i] >= timesteps[i - 1]: - raise ValueError( - "`custom_timesteps` must be in descending order.") + raise ValueError("`custom_timesteps` must be in descending order.") if timesteps[0] >= self.config.num_train_timesteps: raise ValueError( f"`timesteps` must start before `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps}.") + f" {self.config.num_train_timesteps}." 
+ ) timesteps = np.array(timesteps, dtype=np.int64) self.custom_timesteps = True @@ -229,11 +228,11 @@ def set_timesteps( raise ValueError( f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps.") + f" maximal {self.config.num_train_timesteps} timesteps." + ) self.num_inference_steps = num_inference_steps step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = ((np.arange(0, num_inference_steps) * step_ratio) - .round()[::-1].copy().astype(np.int64)) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.custom_timesteps = False self.timesteps = paddle.to_tensor(timesteps) @@ -242,8 +241,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[ - prev_t] if prev_t >= 0 else self.one + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) @@ -301,8 +299,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -310,11 +307,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -322,12 +316,13 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: return sample def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - generator=None, - return_dict: bool=True, ) -> Union[DDPMSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + generator=None, + return_dict: bool = True, + ) -> Union[DDPMSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). 
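`_threshold_sample`, shared by the DDIM and DDPM schedulers in the hunks above, is the dynamic thresholding of https://arxiv.org/abs/2205.11487: clip the predicted x_0 to a per-sample percentile s (never below 1) and rescale by s. A small numpy sketch, assuming an NCHW tensor and `sample_max_value >= 1`:

import numpy as np

def threshold_sample(sample, dynamic_thresholding_ratio=0.995, sample_max_value=1.0):
    batch_size = sample.shape[0]
    abs_sample = np.abs(sample).reshape(batch_size, -1)

    s = np.quantile(abs_sample, dynamic_thresholding_ratio, axis=1)  # "a certain percentile absolute pixel value"
    s = np.clip(s, 1.0, sample_max_value)                            # s >= 1 leaves values already in [-1, 1] untouched
    s = s.reshape(batch_size, *([1] * (sample.ndim - 1)))            # broadcast over channels and pixels

    return np.clip(sample, -s, s) / s                                # threshold to [-s, s], then divide by s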
@@ -349,19 +344,17 @@ def step( t = timestep prev_t = self.previous_timestep(t) - if model_output.shape[1] == sample.shape[ - 1] * 2 and self.variance_type in [ - "learned", - "learned_range", - ]: + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in [ + "learned", + "learned_range", + ]: model_output, predicted_variance = model_output.chunk(2, axis=1) else: predicted_variance = None # 1. compute alphas, betas alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[ - prev_t] if prev_t >= 0 else self.one + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev current_alpha_t = alpha_prod_t / alpha_prod_t_prev @@ -370,17 +363,16 @@ def step( # 2. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) elif self.config.prediction_type == "sample": pred_original_sample = model_output elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * model_output + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" - " `v_prediction` for the DDPMScheduler.") + " `v_prediction` for the DDPMScheduler." + ) # 3. Clip or threshold "predicted x_0" if self.config.thresholding: @@ -389,84 +381,69 @@ def step( pred_original_sample = paddle.clip( pred_original_sample, -self.config.clip_sample_range, - self.config.clip_sample_range, ) + self.config.clip_sample_range, + ) # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample_coeff = (alpha_prod_t_prev - **(0.5) * current_beta_t) / beta_prod_t - current_sample_coeff = current_alpha_t**( - 0.5) * beta_prod_t_prev / beta_prod_t + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t # 5. Compute predicted previous sample µ_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_prev_sample = (pred_original_sample_coeff * pred_original_sample + - current_sample_coeff * sample) + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample # 6. 
Add noise variance = 0 if t > 0: - variance_noise = randn_tensor( - model_output.shape, - generator=generator, - dtype=model_output.dtype) + variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) if self.variance_type == "fixed_small_log": - variance = (self._get_variance( - t, predicted_variance=predicted_variance) * variance_noise) + variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise elif self.variance_type == "learned_range": - variance = self._get_variance( - t, predicted_variance=predicted_variance) + variance = self._get_variance(t, predicted_variance=predicted_variance) variance = paddle.exp(0.5 * variance) * variance_noise else: - variance = (self._get_variance( - t, predicted_variance=predicted_variance) - **0.5) * variance_noise + variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise pred_prev_sample = pred_prev_sample + variance if not return_dict: - return (pred_prev_sample, ) + return (pred_prev_sample,) - return DDPMSchedulerOutput( - prev_sample=pred_prev_sample, - pred_original_sample=pred_original_sample) + return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(sample.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -485,9 +462,9 @@ def previous_timestep(self, timestep): else: 
prev_t = self.timesteps[index + 1] else: - num_inference_steps = (self.num_inference_steps - if self.num_inference_steps else - self.config.num_train_timesteps) + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) prev_t = timestep - self.config.num_train_timesteps // num_inference_steps return prev_t diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py index 845b209a9bc2d..7d4b5802fb447 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py @@ -23,8 +23,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -47,7 +46,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -113,38 +112,41 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[np.ndarray]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - algorithm_type: str="deis", - solver_type: str="logrho", - lower_order_final: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[np.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "deis", + solver_type: str = "logrho", + lower_order_final: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
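The two coefficients assembled in the DDPM `step` above are the posterior mean weights of q(x_{t-1} | x_t, x_0), Eq. (7) of https://arxiv.org/pdf/2006.11239.pdf. Stripped of variance handling and clipping, the update reduces to the following sketch (illustrative names):

def ddpm_posterior_mean(sample, pred_original_sample, alpha_prod_t, alpha_prod_t_prev):
    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev
    current_alpha_t = alpha_prod_t / alpha_prod_t_prev
    current_beta_t = 1 - current_alpha_t

    # Eq. (7): weights on the predicted x_0 and on the current x_t
    pred_original_sample_coeff = alpha_prod_t_prev**0.5 * current_beta_t / beta_prod_t
    current_sample_coeff = current_alpha_t**0.5 * beta_prod_t_prev / beta_prod_t
    return pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample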
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -161,23 +163,17 @@ def __init__( if algorithm_type in ["dpmsolver", "dpmsolver++"]: self.register_to_config(algorithm_type="deis") else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["logrho"]: if solver_type in ["midpoint", "heun", "bh1", "bh2"]: self.register_to_config(solver_type="logrho") else: - raise NotImplementedError( - f"solver type {solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"solver type {solver_type} does is not implemented for {self.__class__}") # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.lower_order_nums = 0 @@ -190,9 +186,12 @@ def set_timesteps(self, num_inference_steps: int): num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. """ - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
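`set_timesteps` in the DEIS hunk above spaces the inference timesteps with `np.linspace` over the training range, rounds to integers, reverses them and drops the trailing point; as the comment notes, the rounding can produce duplicates when `num_inference_steps` approaches `num_train_timesteps`. The spacing itself, as a tiny standalone sketch:

import numpy as np

def deis_inference_timesteps(num_train_timesteps=1000, num_inference_steps=20):
    return (
        np.linspace(0, num_train_timesteps - 1, num_inference_steps + 1)
        .round()[::-1][:-1]  # descending order; the duplicate endpoint is dropped
        .copy()
        .astype(np.int64)
    )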
@@ -203,7 +202,9 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = len(timesteps) - self.model_outputs = [None, ] * self.config.solver_order + self.model_outputs = [ + None, + ] * self.config.solver_order self.lower_order_nums = 0 def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: @@ -228,8 +229,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -237,21 +237,15 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) return sample - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm DEIS needs. @@ -275,7 +269,8 @@ def convert_model_output(self, else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DEISMultistepScheduler.") + " `v_prediction` for the DEISMultistepScheduler." + ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -287,11 +282,12 @@ def convert_model_output(self, raise NotImplementedError("only support log-rho multistep deis now") def deis_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DEIS (equivalent to DDIM). @@ -305,24 +301,23 @@ def deis_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. 
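`convert_model_output` above maps whatever the network predicts (`epsilon`, `sample`, or `v_prediction`) onto one parameterization before the DEIS updates run. The conversions to a predicted x_0 sketched below follow the usual convention for these multistep schedulers, with alpha_t = sqrt(alpha_bar_t) and sigma_t = sqrt(1 - alpha_bar_t); this is a hedged sketch of that convention, not a quote of the method body elided from the hunk:

def to_x0_prediction(model_output, sample, alpha_t, sigma_t, prediction_type="epsilon"):
    if prediction_type == "epsilon":
        return (sample - sigma_t * model_output) / alpha_t
    if prediction_type == "sample":
        return model_output
    if prediction_type == "v_prediction":
        return alpha_t * sample - sigma_t * model_output
    raise ValueError(f"unknown prediction_type: {prediction_type}")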
""" - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ - timestep] + lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] sigma_t, _ = self.sigma_t[prev_timestep], self.sigma_t[timestep] h = lambda_t - lambda_s if self.config.algorithm_type == "deis": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0 - )) * model_output + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output else: raise NotImplementedError("only support log-rho multistep deis now") return x_t def multistep_deis_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order multistep DEIS. @@ -342,28 +337,28 @@ def multistep_deis_second_order_update( alpha_t, alpha_s0, alpha_s1 = ( self.alpha_t[t], self.alpha_t[s0], - self.alpha_t[s1], ) + self.alpha_t[s1], + ) sigma_t, sigma_s0, sigma_s1 = ( self.sigma_t[t], self.sigma_t[s0], - self.sigma_t[s1], ) + self.sigma_t[s1], + ) rho_t, rho_s0, rho_s1 = ( sigma_t / alpha_t, sigma_s0 / alpha_s0, - sigma_s1 / alpha_s1, ) + sigma_s1 / alpha_s1, + ) if self.config.algorithm_type == "deis": def ind_fn(t, b, c): # Integrate[(log(t) - log(c)) / (log(b) - log(c)), {t}] - return (t * (-paddle.log(c) + paddle.log(t) - 1) / - (paddle.log(b) - paddle.log(c))) + return t * (-paddle.log(c) + paddle.log(t) - 1) / (paddle.log(b) - paddle.log(c)) - coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, - rho_s1) - coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, - rho_s0) + coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1) + coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0) x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1) return x_t @@ -371,11 +366,12 @@ def ind_fn(t, b, c): raise NotImplementedError("only support log-rho multistep deis now") def multistep_deis_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order multistep DEIS. 
@@ -394,57 +390,60 @@ def multistep_deis_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] alpha_t, alpha_s0, alpha_s1, alpha_s2 = ( self.alpha_t[t], self.alpha_t[s0], self.alpha_t[s1], - self.alpha_t[s2], ) + self.alpha_t[s2], + ) sigma_t, sigma_s0, sigma_s1, simga_s2 = ( self.sigma_t[t], self.sigma_t[s0], self.sigma_t[s1], - self.sigma_t[s2], ) + self.sigma_t[s2], + ) rho_t, rho_s0, rho_s1, rho_s2 = ( sigma_t / alpha_t, sigma_s0 / alpha_s0, sigma_s1 / alpha_s1, - simga_s2 / alpha_s2, ) + simga_s2 / alpha_s2, + ) if self.config.algorithm_type == "deis": def ind_fn(t, b, c, d): # Integrate[(log(t) - log(c))(log(t) - log(d)) / (log(b) - log(c))(log(b) - log(d)), {t}] numerator = t * ( - paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1 - ) - paddle.log(d) * paddle.log(t) + - paddle.log(d) + paddle.log(t)**2 - 2 * paddle.log(t) + 2) - denominator = (paddle.log(b) - paddle.log(c)) * ( - paddle.log(b) - paddle.log(d)) + paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1) + - paddle.log(d) * paddle.log(t) + + paddle.log(d) + + paddle.log(t) ** 2 + - 2 * paddle.log(t) + + 2 + ) + denominator = (paddle.log(b) - paddle.log(c)) * (paddle.log(b) - paddle.log(d)) return numerator / denominator - coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn( - rho_s0, rho_s0, rho_s1, rho_s2) - coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn( - rho_s0, rho_s1, rho_s2, rho_s0) - coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn( - rho_s0, rho_s2, rho_s0, rho_s1) + coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(rho_s0, rho_s0, rho_s1, rho_s2) + coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s2, rho_s0) + coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s2, rho_s0, rho_s1) - x_t = alpha_t * ( - sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2) + x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2) return x_t else: raise NotImplementedError("only support log-rho multistep deis now") def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep DEIS. 
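The third-order ind_fn in the hunk above works the same way: the bracketed polynomial in log t is an antiderivative of (log t - log c)(log t - log d) / ((log b - log c)(log b - log d)). The same style of check, again with hypothetical values:

import numpy as np

def ind_fn3(t, b, c, d):
    numerator = t * (
        np.log(c) * (np.log(d) - np.log(t) + 1)
        - np.log(d) * np.log(t)
        + np.log(d)
        + np.log(t) ** 2
        - 2 * np.log(t)
        + 2
    )
    denominator = (np.log(b) - np.log(c)) * (np.log(b) - np.log(d))
    return numerator / denominator

b, c, d = 0.9, 0.5, 0.2                      # hypothetical rho values
lo, hi = 0.3, 0.8
t = np.linspace(lo, hi, 200001)
integrand = (np.log(t) - np.log(c)) * (np.log(t) - np.log(d)) / (
    (np.log(b) - np.log(c)) * (np.log(b) - np.log(d))
)
assert abs(np.trapz(integrand, t) - (ind_fn3(hi, b, c, d) - ind_fn3(lo, b, c, d))) < 1e-6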
@@ -470,29 +469,26 @@ def step( step_index = len(self.timesteps) - 1 else: step_index = step_index.item() - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) - lower_order_final = ((step_index == len(self.timesteps) - 1) and - self.config.lower_order_final and - len(self.timesteps) < 15) - lower_order_second = ((step_index == len(self.timesteps) - 2) and - self.config.lower_order_final and - len(self.timesteps) < 15) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] + lower_order_final = ( + (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + lower_order_second = ( + (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) model_output = self.convert_model_output(model_output, timestep, sample) for i in range(self.config.solver_order - 1): self.model_outputs[i] = self.model_outputs[i + 1] self.model_outputs[-1] = model_output - if (self.config.solver_order == 1 or self.lower_order_nums < 1 or - lower_order_final): - prev_sample = self.deis_first_order_update(model_output, timestep, - prev_timestep, sample) - elif (self.config.solver_order == 2 or self.lower_order_nums < 2 or - lower_order_second): + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.deis_first_order_update(model_output, timestep, prev_timestep, sample) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: timestep_list = [self.timesteps[step_index - 1], timestep] prev_sample = self.multistep_deis_second_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) else: timestep_list = [ self.timesteps[step_index - 2], @@ -500,18 +496,18 @@ def step( timestep, ] prev_sample = self.multistep_deis_third_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
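The conditionals in step() above implement a warm-up in solver order and, for short schedules, a cool-down at the end. A small standalone sketch of the effective order per step, mirroring those conditions (solver_order=3 here purely for illustration):

def effective_orders(num_steps, solver_order=3, lower_order_final=True):
    orders = []
    lower_order_nums = 0
    for step_index in range(num_steps):
        final = (step_index == num_steps - 1) and lower_order_final and num_steps < 15
        second = (step_index == num_steps - 2) and lower_order_final and num_steps < 15
        if solver_order == 1 or lower_order_nums < 1 or final:
            order = 1
        elif solver_order == 2 or lower_order_nums < 2 or second:
            order = 2
        else:
            order = 3
        orders.append(order)
        if lower_order_nums < solver_order:
            lower_order_nums += 1
    return orders

print(effective_orders(10))  # [1, 2, 3, 3, 3, 3, 3, 3, 2, 1]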
@@ -525,26 +521,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py index 9b360646172d5..5ebc674044afa 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -22,8 +22,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -127,39 +126,42 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - algorithm_type: str="dpmsolver++", - solver_type: str="midpoint", - lower_order_final: bool=True, - use_karras_sigmas: Optional[bool]=False, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + use_karras_sigmas: Optional[bool] = False, + ): if trained_betas is not None: 
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -176,23 +178,17 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.lower_order_nums = 0 @@ -206,18 +202,17 @@ def set_timesteps(self, num_inference_steps: int): num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. 
""" - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) if self.use_karras_sigmas: - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) - **0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) - sigmas = self._convert_to_karras( - in_sigmas=sigmas, num_inference_steps=num_inference_steps) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas) - for sigma in sigmas]).round() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() timesteps = np.flip(timesteps).copy().astype(np.int64) # when num_inference_steps == num_train_timesteps, we can end up with @@ -229,7 +224,9 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = len(timesteps) - self.model_outputs = [None, ] * self.config.solver_order + self.model_outputs = [ + None, + ] * self.config.solver_order self.lower_order_nums = 0 def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: @@ -254,8 +251,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -263,11 +259,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -282,9 +275,7 @@ def _sigma_to_t(self, sigma, log_sigmas): dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range - low_idx = (np.cumsum( - (dists >= 0), axis=0).argmax(axis=0) - .clip(max=log_sigmas.shape[0] - 2)) + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] @@ -299,8 +290,7 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, in_sigmas: paddle.Tensor, - num_inference_steps) -> paddle.Tensor: + def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" sigma_min = in_sigmas[-1].item() @@ -308,15 +298,12 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor, rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min**(1 / rho) - max_inv_rho = sigma_max**(1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. @@ -339,19 +326,18 @@ def convert_model_output(self, # DPM-Solver++ needs to solve an integral of the data prediction model. if self.config.algorithm_type == "dpmsolver++": if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = (sample - sigma_t * model_output) / alpha_t elif self.config.prediction_type == "sample": x0_pred = model_output elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = alpha_t * sample - sigma_t * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverMultistepScheduler.") + " `v_prediction` for the DPMSolverMultistepScheduler." + ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -362,26 +348,26 @@ def convert_model_output(self, if self.config.prediction_type == "epsilon": return model_output elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = (sample - alpha_t * model_output) / sigma_t return epsilon elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = alpha_t * model_output + sigma_t * sample return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverMultistepScheduler.") + " `v_prediction` for the DPMSolverMultistepScheduler." + ) def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DPM-Solver (equivalent to DDIM). @@ -397,25 +383,23 @@ def dpm_solver_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. 
""" - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ - timestep] + lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] h = lambda_t - lambda_s if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * ( - paddle.exp(-h) - 1.0)) * model_output + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output elif self.config.algorithm_type == "dpmsolver": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0 - )) * model_output + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output return x_t def multistep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order multistep DPM-Solver. @@ -435,7 +419,8 @@ def multistep_dpm_solver_second_order_update( lambda_t, lambda_s0, lambda_s1 = ( self.lambda_t[t], self.lambda_t[s0], - self.lambda_t[s1], ) + self.lambda_t[s1], + ) alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 @@ -444,31 +429,40 @@ def multistep_dpm_solver_second_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2211.01095 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 * - (alpha_t * (paddle.exp(-h) - 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - 0.5 * - (sigma_t * (paddle.exp(h) - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + ) return x_t def multistep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order multistep DPM-Solver. 
@@ -487,14 +481,15 @@ def multistep_dpm_solver_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1], - self.lambda_t[s2], ) + self.lambda_t[s2], + ) alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 @@ -505,24 +500,29 @@ def multistep_dpm_solver_third_order_update( D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * ( - (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ((alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ( - (paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2) + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) return x_t def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep DPM-Solver. 
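The D0/D1/D2 terms in the second- and third-order updates are scaled divided differences of the converted model output viewed as a function of lambda; with a smooth stand-in function they behave like h * m' and (h**2 / 2) * m''. A rough numeric illustration, all values hypothetical:

import numpy as np

m = np.cos                                               # smooth stand-in for the model output m(lambda)
lam_s2, lam_s1, lam_s0, lam_t = 0.10, 0.35, 0.55, 0.70   # increasing half-log-SNR
h, h_0, h_1 = lam_t - lam_s0, lam_s0 - lam_s1, lam_s1 - lam_s2
r0, r1 = h_0 / h, h_1 / h
m0, m1, m2 = m(lam_s0), m(lam_s1), m(lam_s2)

D1_0 = (1.0 / r0) * (m0 - m1)
D1_1 = (1.0 / r1) * (m1 - m2)
D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)

# First difference ~ h * m' near the midpoint of s0 and s1; m' = -sin for m = cos.
assert abs(D1_0 / h - (-np.sin((lam_s0 + lam_s1) / 2))) < 1e-2
# Second difference ~ (h**2 / 2) * m''; m'' = -cos for m = cos.
assert abs(2 * D2 / h**2 - (-np.cos(lam_s1))) < 2e-2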
@@ -548,29 +548,26 @@ def step( step_index = len(self.timesteps) - 1 else: step_index = step_index.item() - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) - lower_order_final = ((step_index == len(self.timesteps) - 1) and - self.config.lower_order_final and - len(self.timesteps) < 15) - lower_order_second = ((step_index == len(self.timesteps) - 2) and - self.config.lower_order_final and - len(self.timesteps) < 15) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] + lower_order_final = ( + (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + lower_order_second = ( + (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) model_output = self.convert_model_output(model_output, timestep, sample) for i in range(self.config.solver_order - 1): self.model_outputs[i] = self.model_outputs[i + 1] self.model_outputs[-1] = model_output - if (self.config.solver_order == 1 or self.lower_order_nums < 1 or - lower_order_final): - prev_sample = self.dpm_solver_first_order_update( - model_output, timestep, prev_timestep, sample) - elif (self.config.solver_order == 2 or self.lower_order_nums < 2 or - lower_order_second): + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: timestep_list = [self.timesteps[step_index - 1], timestep] prev_sample = self.multistep_dpm_solver_second_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) else: timestep_list = [ self.timesteps[step_index - 2], @@ -578,18 +575,18 @@ def step( timestep, ] prev_sample = self.multistep_dpm_solver_third_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
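Putting the pieces of step() together, a minimal usage sketch of this scheduler. The top-level import assumes DPMSolverMultistepScheduler is re-exported from the ppdiffusers package root (as its diffusers counterpart is); the zero model output is only a stand-in for a real UNet call:

import paddle
from ppdiffusers import DPMSolverMultistepScheduler  # assumed re-export; otherwise import from the schedulers module

scheduler = DPMSolverMultistepScheduler()     # defaults as in __init__ above
scheduler.set_timesteps(num_inference_steps=20)

sample = paddle.randn([1, 4, 8, 8])           # stand-in latent
for t in scheduler.timesteps:
    # A real pipeline would call its UNet here; zeros keep the sketch
    # self-contained while still exercising the multistep machinery.
    model_output = paddle.zeros_like(sample)
    sample = scheduler.step(model_output, t, sample).prev_sample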
@@ -603,26 +600,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py index 499d2e90373b9..0e99f01aa230b 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -22,8 +22,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -123,38 +122,41 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[np.ndarray]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - algorithm_type: str="dpmsolver++", - solver_type: str="midpoint", - lower_order_final: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[np.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = 
paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -171,22 +173,16 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.sample = None @@ -248,8 +244,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -257,11 +252,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -277,18 +269,18 @@ def set_timesteps(self, num_inference_steps: int): the number of diffusion steps used when generating samples with a pre-trained model. 
""" self.num_inference_steps = num_inference_steps - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * self.config.solver_order self.sample = None self.orders = self.get_order_list(num_inference_steps) - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. @@ -311,19 +303,18 @@ def convert_model_output(self, # DPM-Solver++ needs to solve an integral of the data prediction model. if self.config.algorithm_type == "dpmsolver++": if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = (sample - sigma_t * model_output) / alpha_t elif self.config.prediction_type == "sample": x0_pred = model_output elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = alpha_t * sample - sigma_t * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverSinglestepScheduler.") + " `v_prediction` for the DPMSolverSinglestepScheduler." + ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -334,26 +325,26 @@ def convert_model_output(self, if self.config.prediction_type == "epsilon": return model_output elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = (sample - alpha_t * model_output) / sigma_t return epsilon elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = alpha_t * model_output + sigma_t * sample return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverSinglestepScheduler.") + " `v_prediction` for the DPMSolverSinglestepScheduler." + ) def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DPM-Solver (equivalent to DDIM). @@ -369,25 +360,23 @@ def dpm_solver_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. 
""" - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ - timestep] + lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] h = lambda_t - lambda_s if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * ( - paddle.exp(-h) - 1.0)) * model_output + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output elif self.config.algorithm_type == "dpmsolver": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0 - )) * model_output + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output return x_t def singlestep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order singlestep DPM-Solver. @@ -409,7 +398,8 @@ def singlestep_dpm_solver_second_order_update( lambda_t, lambda_s0, lambda_s1 = ( self.lambda_t[t], self.lambda_t[s0], - self.lambda_t[s1], ) + self.lambda_t[s1], + ) alpha_t, alpha_s1 = self.alpha_t[t], self.alpha_t[s1] sigma_t, sigma_s1 = self.sigma_t[t], self.sigma_t[s1] h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1 @@ -418,31 +408,40 @@ def singlestep_dpm_solver_second_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2211.01095 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s1) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 * - (alpha_t * (paddle.exp(-h) - 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s1) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((sigma_t / sigma_s1) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s1) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((alpha_t / alpha_s1) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - 0.5 * - (sigma_t * (paddle.exp(h) - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s1) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((alpha_t / alpha_s1) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s1) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + ) return x_t def singlestep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order singlestep DPM-Solver. 
@@ -463,14 +462,15 @@ def singlestep_dpm_solver_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1], - self.lambda_t[s2], ) + self.lambda_t[s2], + ) alpha_t, alpha_s2 = self.alpha_t[t], self.alpha_t[s2] sigma_t, sigma_s2 = self.sigma_t[t], self.sigma_t[s2] h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2 @@ -482,35 +482,43 @@ def singlestep_dpm_solver_third_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s2) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1) + x_t = ( + (sigma_t / sigma_s2) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1 + ) elif self.config.solver_type == "heun": x_t = ( - (sigma_t / sigma_s2) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * ( - (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + (sigma_t / sigma_s2) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((alpha_t / alpha_s2) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1_1) + x_t = ( + (alpha_t / alpha_s2) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1_1 + ) elif self.config.solver_type == "heun": - x_t = ((alpha_t / alpha_s2) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ( - (paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2) + x_t = ( + (alpha_t / alpha_s2) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) return x_t def singlestep_dpm_solver_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - order: int, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + order: int, + ) -> paddle.Tensor: """ One step for the singlestep DPM-Solver. @@ -528,23 +536,25 @@ def singlestep_dpm_solver_update( `paddle.Tensor`: the sample tensor at the previous timestep. 
""" if order == 1: - return self.dpm_solver_first_order_update( - model_output_list[-1], timestep_list[-1], prev_timestep, sample) + return self.dpm_solver_first_order_update(model_output_list[-1], timestep_list[-1], prev_timestep, sample) elif order == 2: return self.singlestep_dpm_solver_second_order_update( - model_output_list, timestep_list, prev_timestep, sample) + model_output_list, timestep_list, prev_timestep, sample + ) elif order == 3: return self.singlestep_dpm_solver_third_order_update( - model_output_list, timestep_list, prev_timestep, sample) + model_output_list, timestep_list, prev_timestep, sample + ) else: raise ValueError(f"Order must be 1, 2, 3, got {order}") def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the singlestep DPM-Solver. @@ -570,8 +580,7 @@ def step( step_index = len(self.timesteps) - 1 else: step_index = step_index.item() - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] model_output = self.convert_model_output(model_output, timestep, sample) for i in range(self.config.solver_order - 1): @@ -583,20 +592,17 @@ def step( if order == 1: self.sample = sample - timestep_list = [ - self.timesteps[step_index - i] for i in range(order - 1, 0, -1) - ] + [timestep] + timestep_list = [self.timesteps[step_index - i] for i in range(order - 1, 0, -1)] + [timestep] prev_sample = self.singlestep_dpm_solver_update( - self.model_outputs, timestep_list, prev_timestep, self.sample, - order) + self.model_outputs, timestep_list, prev_timestep, self.sample, order + ) if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
@@ -610,26 +616,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py index 82931a90d6eff..eccdbb7bfdcf4 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py @@ -18,17 +18,14 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput def logaddexp(x, y): - return paddle.log(1 + paddle.exp( - paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y) + return paddle.log(1 + paddle.exp(paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y) -def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, - yp: paddle.Tensor) -> paddle.Tensor: +def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, yp: paddle.Tensor) -> paddle.Tensor: """Performs piecewise linear interpolation for x, using xp and yp keypoints (knots). Performs separate interpolation for each channel. 
Args: @@ -45,8 +42,7 @@ def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, >>> calibrate1d(paddle.to_tensor([[-10]]), paddle.to_tensor([[0.0, 1.0]]), paddle.to_tensor([[0.0, 2.0]])) tensor([[-20.0000]]) """ - x_breakpoints = paddle.concat( - [x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2) + x_breakpoints = paddle.concat([x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2) num_x_points = xp.shape[1] sorted_x_breakpoints = paddle.sort(x_breakpoints, axis=2) x_indices = paddle.argsort(x_breakpoints, axis=2) @@ -58,29 +54,26 @@ def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, paddle.where( paddle.equal(x_idx, num_x_points), paddle.to_tensor([num_x_points - 2]), - cand_start_idx, ), ) - end_idx = paddle.where( - paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) - start_x = paddle.take_along_axis( - arr=sorted_x_breakpoints, axis=2, - indices=start_idx.unsqueeze(axis=2)).squeeze(axis=2) - end_x = paddle.take_along_axis( - arr=sorted_x_breakpoints, axis=2, - indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2) + cand_start_idx, + ), + ) + end_idx = paddle.where(paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) + start_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=start_idx.unsqueeze(axis=2)).squeeze( + axis=2 + ) + end_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2) start_idx2 = paddle.where( paddle.equal(x_idx, 0), paddle.to_tensor([0]), paddle.where( paddle.equal(x_idx, num_x_points), paddle.to_tensor([num_x_points - 2]), - cand_start_idx, ), ) + cand_start_idx, + ), + ) y_positions_expanded = yp.unsqueeze(0).expand([x.shape[0], -1, -1]) - start_y = paddle.take_along_axis( - y_positions_expanded, axis=2, - indices=start_idx2.unsqueeze(2)).squeeze(2) - end_y = paddle.take_along_axis( - y_positions_expanded, axis=2, - indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2) + start_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=start_idx2.unsqueeze(2)).squeeze(2) + end_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2) cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x) return cand @@ -128,35 +121,38 @@ class DPMSolverUniDiffuserScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, - beta_end: float=0.0120, - method="multistep", - schedule: str="discrete", - beta_schedule: str="scaled_linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - algorithm_type: str="dpmsolver++", - solver_type: str="midpoint", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.0120, + method="multistep", + schedule: str = "discrete", + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) if beta_schedule == "scaled_linear": # this schedule is very specific to the unidiffuser model. 
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") if schedule == "discrete": log_alphas = 0.5 * paddle.log(1 - self.betas).cumsum(axis=0) self.total_N = len(log_alphas) - self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0, - self.total_N).reshape([1, -1]) + self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0, self.total_N).reshape([1, -1]) self.log_alpha_discrete = log_alphas.reshape((1, -1)) else: raise ValueError @@ -172,16 +168,12 @@ def __init__( if algorithm_type == "deis": algorithm_type = "dpmsolver++" else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["midpoint"]: if solver_type in ["logrho", "bh1", "bh2"]: solver_type = "midpoint" else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 @@ -196,7 +188,8 @@ def marginal_log_mean_coeff(self, t): return interpolate_fn( t.reshape((-1, 1)), self.t_discrete.clone(), - self.log_alpha_discrete.clone(), ).reshape((-1, )) + self.log_alpha_discrete.clone(), + ).reshape((-1,)) else: raise ValueError @@ -207,8 +200,7 @@ def marginal_std(self, t): """ Compute sigma_t of a given continuous-time label t in [0, T]. """ - return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff( - t))) + return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff(t))) def marginal_lambda(self, t): """ @@ -220,12 +212,13 @@ def marginal_lambda(self, t): def inverse_lambda(self, lamb): if self.schedule == "discrete": - log_alpha = -0.5 * logaddexp(paddle.zeros((1, )), -2.0 * lamb) + log_alpha = -0.5 * logaddexp(paddle.zeros((1,)), -2.0 * lamb) t = interpolate_fn( log_alpha.reshape((-1, 1)), paddle.flip(self.log_alpha_discrete.clone(), [1]), - paddle.flip(self.t_discrete.clone(), [1]), ) - return t.reshape((-1, )) + paddle.flip(self.t_discrete.clone(), [1]), + ) + return t.reshape((-1,)) else: raise ValueError @@ -243,10 +236,7 @@ def set_timesteps(self, num_inference_steps: int): self.noise_prev_list = [] self.t_prev_list = [] - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. @@ -267,17 +257,17 @@ def convert_model_output(self, `paddle.Tensor`: the converted model output. """ # DPM-Solver++ needs to solve an integral of the data prediction model. 
- alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std( - timestep) + alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std(timestep) x0_pred = (sample - sigma_t * model_output) / alpha_t return x0_pred def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DPM-Solver (equivalent to DDIM). @@ -293,27 +283,25 @@ def dpm_solver_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. """ - lambda_t, lambda_s = self.marginal_lambda( - timestep), self.marginal_lambda(prev_timestep) + lambda_t, lambda_s = self.marginal_lambda(timestep), self.marginal_lambda(prev_timestep) alpha_t = self.marginal_log_mean_coeff(timestep) - sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std( - prev_timestep) + sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std(prev_timestep) alpha_t = paddle.exp(alpha_t) h = lambda_t - lambda_s if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * ( - paddle.exp(-h) - 1.0)) * model_output + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output else: raise ValueError return x_t def multistep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order multistep DPM-Solver. @@ -333,7 +321,8 @@ def multistep_dpm_solver_second_order_update( lambda_t, lambda_s0, lambda_s1 = ( self.marginal_lambda(t), self.marginal_lambda(s0), - self.marginal_lambda(s1), ) + self.marginal_lambda(s1), + ) log_alpha_t = self.marginal_log_mean_coeff(t) sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0) h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 @@ -343,19 +332,22 @@ def multistep_dpm_solver_second_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2211.01095 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 * - (alpha_t * (paddle.exp(-h) - 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 + ) else: raise ValueError return x_t def multistep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order multistep DPM-Solver. 
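The logaddexp helper defined earlier in this file is what makes inverse_lambda stable: for a VP process with sigma**2 = 1 - alpha**2 (assumed here), log(alpha) = -0.5 * log(1 + exp(-2 * lambda)), which is exactly -0.5 * logaddexp(0, -2 * lambda) evaluated without overflow. A NumPy round-trip check:

import numpy as np

alpha = np.array([0.999, 0.9, 0.5, 0.05])     # hypothetical marginal alpha values
sigma = np.sqrt(1 - alpha**2)
lam = np.log(alpha) - np.log(sigma)           # half-log-SNR, as in marginal_lambda
log_alpha_back = -0.5 * np.logaddexp(0.0, -2.0 * lam)
assert np.allclose(log_alpha_back, np.log(alpha))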
@@ -374,14 +366,15 @@ def multistep_dpm_solver_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( self.marginal_lambda(t), self.marginal_lambda(s0), self.marginal_lambda(s1), - self.marginal_lambda(s2), ) + self.marginal_lambda(s2), + ) alpha_t = self.marginal_log_mean_coeff(t) alpha_t = paddle.exp(alpha_t) sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0) @@ -393,20 +386,23 @@ def multistep_dpm_solver_third_order_update( D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * ( - (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) else: raise ValueError return x_t def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep DPM-Solver. @@ -437,59 +433,47 @@ def step( if self.method == "multistep": if step_index == 0: vec_t = timestep.expand([sample.shape[0]]) - model_output = self.convert_model_output(model_output, vec_t, - sample) + model_output = self.convert_model_output(model_output, vec_t, sample) self.noise_prev_list.append(model_output) self.t_prev_list.append(vec_t) if step_index > 0 and step_index < order: vec_t = timestep.expand([sample.shape[0]]) - sample = self.dpm_multistep_update(sample, self.noise_prev_list, - self.t_prev_list, vec_t, - step_index) - model_output = self.convert_model_output(model_output, vec_t, - sample) + sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, step_index) + model_output = self.convert_model_output(model_output, vec_t, sample) self.noise_prev_list.append(model_output) self.t_prev_list.append(vec_t) if step_index >= order and step_index < len(self.timesteps): vec_t = timestep.expand([sample.shape[0]]) - sample = self.dpm_multistep_update(sample, self.noise_prev_list, - self.t_prev_list, vec_t, - order) + sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, order) for i in range(order - 1): self.t_prev_list[i] = self.t_prev_list[i + 1] self.noise_prev_list[i] = self.noise_prev_list[i + 1] self.t_prev_list[-1] = vec_t if step_index < len(self.timesteps) - 1: - self.noise_prev_list[-1] = self.convert_model_output( - model_output, vec_t, sample) + self.noise_prev_list[-1] = self.convert_model_output(model_output, vec_t, sample) else: raise ValueError prev_sample = sample if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def dpm_multistep_update(self, x, noise_prev_list, t_prev_list, t, order): if order == 1: - return self.dpm_solver_first_order_update(noise_prev_list[-1], t, - 
t_prev_list[-1], x) + return self.dpm_solver_first_order_update(noise_prev_list[-1], t, t_prev_list[-1], x) elif order == 2: - return self.multistep_dpm_solver_second_order_update( - noise_prev_list, t_prev_list, t, x) + return self.multistep_dpm_solver_second_order_update(noise_prev_list, t_prev_list, t, x) elif order == 3: - return self.multistep_dpm_solver_third_order_update( - noise_prev_list, t_prev_list, t, x) + return self.multistep_dpm_solver_third_order_update(noise_prev_list, t_prev_list, t, x) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format( - order)) + raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py index a2a0a495031de..95332c844b137 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -47,8 +47,7 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -108,37 +107,39 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -147,15 +148,11 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.is_scale_input_called = False - def scale_model_input( - self, sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. @@ -168,7 +165,7 @@ def scale_model_input( """ step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) self.is_scale_input_called = True return sample @@ -182,27 +179,21 @@ def set_timesteps(self, num_inference_steps: int): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[ - EulerAncestralDiscreteSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). 
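The same three-step pattern recurs in every scheduler touched by this diff: build betas (here via the scaled_linear squared linspace), take the cumulative product of alphas, and reparameterize to k-diffusion sigmas before interpolating them onto the inference timesteps in set_timesteps. A standalone NumPy sketch of that pipeline (the helper name is mine):

import numpy as np

def build_sigma_schedule(beta_start=0.0001, beta_end=0.02, num_train_timesteps=1000, num_inference_steps=50):
    # scaled_linear: square of a linspace between sqrt(beta_start) and sqrt(beta_end)
    betas = np.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps) ** 2
    alphas_cumprod = np.cumprod(1.0 - betas)
    # sigma_t = sqrt((1 - alpha_bar_t) / alpha_bar_t), the variance-exploding parameterization
    sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
    # spread the inference timesteps over the training range, noisiest first,
    # interpolate the training sigmas onto them, and close the schedule with sigma = 0
    timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
    sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
    sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
    return timesteps, sigmas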
@@ -224,7 +215,8 @@ def step( if not self.is_scale_input_called: logger.warning( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." + ) step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] @@ -233,11 +225,9 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5 - ) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -245,9 +235,8 @@ def step( sigma_from = self.sigmas[step_index] sigma_to = self.sigmas[step_index + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from - **2)**0.5 - sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 # 2. Convert to an ODE derivative derivative = (sample - pred_original_sample) / sigma @@ -256,28 +245,28 @@ def step( prev_sample = sample + derivative * dt - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) prev_sample = prev_sample + noise * sigma_up if not return_dict: - return (prev_sample, ) + return (prev_sample,) return EulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + prev_sample=prev_sample, pred_original_sample=pred_original_sample + ) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py index 8d53b8dd4f3a9..a45e3bf0e5617 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py @@ -66,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -114,39 +114,41 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, 
- beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - interpolation_type: str="linear", - use_karras_sigmas: Optional[bool]=False, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + interpolation_type: str = "linear", + use_karras_sigmas: Optional[bool] = False, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -155,16 +157,12 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.is_scale_input_called = False self.use_karras_sigmas = use_karras_sigmas - def scale_model_input( - self, sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. 
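As the docstring above states, the Euler-style schedulers rescale the model input by 1 / sqrt(sigma**2 + 1) before every UNet call: the sampler state is roughly x0 + sigma * noise, so the division brings it back to roughly unit variance. A minimal sketch with an illustrative usage (the numbers are arbitrary):

import numpy as np

def scale_model_input(sample, sigma):
    # undo the variance growth of x = x0 + sigma * noise before feeding the denoiser
    return sample / ((sigma**2 + 1) ** 0.5)

sigma = 14.6
x = np.random.randn(4, 64) * (1 + sigma**2) ** 0.5  # a latent at noise level sigma
x_in = scale_model_input(x, sigma)                   # back to roughly unit variance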
@@ -178,7 +176,7 @@ def scale_model_input( step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) self.is_scale_input_called = True return sample @@ -193,31 +191,23 @@ def set_timesteps(self, num_inference_steps: int): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) if self.config.interpolation_type == "linear": sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) elif self.config.interpolation_type == "log_linear": - sigmas = paddle.linspace( - np.log(sigmas[-1]), np.log(sigmas[0]), - num_inference_steps + 1).exp() + sigmas = paddle.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp() else: raise ValueError( f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either" - " 'linear' or 'log_linear'") + " 'linear' or 'log_linear'" + ) if self.use_karras_sigmas: - sigmas = self._convert_to_karras( - in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -231,9 +221,7 @@ def _sigma_to_t(self, sigma, log_sigmas): dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range - low_idx = (np.cumsum( - (dists >= 0), axis=0).argmax(axis=0) - .clip(max=log_sigmas.shape[0] - 2)) + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] @@ -248,8 +236,7 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, in_sigmas: paddle.Tensor, - num_inference_steps) -> paddle.Tensor: + def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" sigma_min = in_sigmas[-1].item() @@ -257,24 +244,23 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor, rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min**(1 / rho) - max_inv_rho = sigma_max**(1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - s_churn: float=0.0, - s_tmin: float=0.0, - s_tmax: float=float("inf"), - s_noise: float=1.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[EulerDiscreteSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[EulerDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -301,35 +287,32 @@ def step( if not self.is_scale_input_called: logger.warning( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." + ) step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - gamma = (min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) - if s_tmin <= sigma <= s_tmax else 0.0) + gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) eps = noise * s_noise sigma_hat = sigma * (gamma + 1) if gamma > 0: - sample = sample + eps * (sigma_hat**2 - sigma**2)**0.5 + sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise # NOTE: "original_sample" should not be an expected prediction_type but is left in for # backwards compatibility - if (self.config.prediction_type == "original_sample" or - self.config.prediction_type == "sample"): + if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample": pred_original_sample = model_output elif self.config.prediction_type == "epsilon": pred_original_sample = sample - sigma_hat * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5 - ) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -343,22 +326,21 @@ def step( prev_sample = sample + derivative * dt if not return_dict: - return (prev_sample, ) + return (prev_sample,) - return EulerDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py index 4cd27a38164ff..05a8673a2a358 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py @@ -20,13 +20,11 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
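The squaredcos_cap_v2 branch in these constructors delegates to betas_for_alpha_bar, whose signature, docstring, and alpha_bar line appear in the hunk below. The loop body is elided there; the sketch below fills it in with the standard diffusers/ppdiffusers formulation (plain Python, returning a list instead of a tensor), where each beta is picked so that alpha_bar(t2) = (1 - beta) * alpha_bar(t1), capped at max_beta:

import math

def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    def alpha_bar(time_step):
        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # choose beta_i so that alpha_bar(t2) / alpha_bar(t1) = 1 - beta_i, capped at max_beta
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return betas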
@@ -45,7 +43,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -90,32 +88,35 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, # sensible defaults - beta_end: float=0.012, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - use_karras_sigmas: Optional[bool]=False, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + use_karras_sigmas: Optional[bool] = False, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -137,9 +138,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): return indices[pos].item() def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor: + self, + sample: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + ) -> paddle.Tensor: """ Args: Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -151,13 +153,14 @@ def scale_model_input( step_index = self.index_for_timestep(timestep) sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int]=None, ): + self, + num_inference_steps: int, + num_train_timesteps: Optional[int] = None, + ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
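When use_karras_sigmas is set, the set_timesteps bodies below replace the interpolated sigmas with the Karras et al. (2022) schedule: a linear ramp in sigma**(1/rho) space with rho = 7. A NumPy sketch of _convert_to_karras as it appears in this diff:

import numpy as np

def convert_to_karras(in_sigmas, num_inference_steps, rho=7.0):
    # in_sigmas is ordered from largest to smallest, so [0] is sigma_max and [-1] is sigma_min
    sigma_min, sigma_max = in_sigmas[-1], in_sigmas[0]
    ramp = np.linspace(0, 1, num_inference_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho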
@@ -169,32 +172,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_inference_steps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) if self.use_karras_sigmas: - sigmas = self._convert_to_karras( - in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = paddle.to_tensor(sigmas) - self.sigmas = paddle.concat( - [sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) + self.sigmas = paddle.concat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) # standard deviation of the initial noise distribution self.init_noise_sigma = self.sigmas.max() timesteps = paddle.to_tensor(timesteps) - timesteps = paddle.concat( - [timesteps[:1], timesteps[1:].repeat_interleave(2)]) + timesteps = paddle.concat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) self.timesteps = timesteps.cast(paddle.float32) @@ -210,9 +206,7 @@ def _sigma_to_t(self, sigma, log_sigmas): dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range - low_idx = (np.cumsum( - (dists >= 0), axis=0).argmax(axis=0) - .clip(max=log_sigmas.shape[0] - 2)) + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] @@ -227,8 +221,7 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, in_sigmas: paddle.Tensor, - num_inference_steps) -> paddle.Tensor: + def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = in_sigmas[-1].item() @@ -236,9 +229,9 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor, rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min**(1 / rho) - max_inv_rho = sigma_max**(1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas @property @@ -246,11 +239,12 @@ def state_in_first_order(self): return self.dt is None def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: Union[paddle.Tensor, np.ndarray], + timestep: Union[float, paddle.Tensor], + sample: Union[paddle.Tensor, np.ndarray], + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Args: Predict the sample at the previous timestep by reversing the SDE. 
Core function to propagate the diffusion @@ -286,11 +280,11 @@ def step( pred_original_sample = sample - sigma_input * model_output elif self.config.prediction_type == "v_prediction": sigma_input = sigma_hat if self.state_in_first_order else sigma_next - pred_original_sample = model_output * (-sigma_input / ( - sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1)) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -324,22 +318,21 @@ def step( prev_sample = sample + derivative * dt if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [ - self.index_for_timestep(t, schedule_timesteps) for t in timesteps - ] + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py index 4d2b87c82ae86..8b8595755cb61 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py @@ -43,9 +43,10 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, ): + self, + num_train_timesteps: int = 1000, + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + ): # set `betas`, `alphas`, `timesteps` self.set_timesteps(num_train_timesteps) @@ -73,24 +74,23 @@ def set_timesteps(self, num_inference_steps: int): steps = paddle.concat([steps, paddle.to_tensor([0.0])]) if self.config.trained_betas is not None: - self.betas = paddle.to_tensor( - self.config.trained_betas, dtype=paddle.float32) + self.betas = paddle.to_tensor(self.config.trained_betas, dtype=paddle.float32) else: - self.betas = paddle.sin(steps * math.pi / 2)**2 + self.betas = paddle.sin(steps * math.pi / 2) ** 2 - self.alphas = (1.0 - self.betas**2)**0.5 + self.alphas = (1.0 - self.betas**2) ** 0.5 - self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi * - 2)[:-1] + self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi * 2)[:-1] self.ets = [] def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple times to approximate the solution. 
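The iPNDM step below keeps a short history of noise estimates (self.ets) and combines them with classic linear multistep (Adams-Bashforth) weights before calling _get_prev_sample. The coefficients are visible in the next hunk; the single-estimate branch is elided there, so its body in this sketch is the natural fallback of using the estimate itself:

def adams_bashforth_combine(ets):
    # ets: most recent noise estimates, newest last (mirrors self.ets in the step() below)
    if len(ets) == 1:
        return ets[-1]
    if len(ets) == 2:
        return (3 * ets[-1] - ets[-2]) / 2
    if len(ets) == 3:
        return (23 * ets[-1] - 16 * ets[-2] + 5 * ets[-3]) / 12
    return (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4]) / 24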
@@ -119,8 +119,7 @@ def step( timestep_index = (self.timesteps == timestep).nonzero().item() prev_timestep_index = timestep_index + 1 - ets = (sample * self.betas[timestep_index] + model_output * - self.alphas[timestep_index]) + ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index] self.ets.append(ets) if len(self.ets) == 1: @@ -128,22 +127,18 @@ def step( elif len(self.ets) == 2: ets = (3 * self.ets[-1] - self.ets[-2]) / 2 elif len(self.ets) == 3: - ets = ( - 23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 + ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 else: - ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * - self.ets[-3] - 9 * self.ets[-4]) + ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) - prev_sample = self._get_prev_sample(sample, timestep_index, - prev_timestep_index, ets) + prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets) if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -156,8 +151,7 @@ def scale_model_input(self, sample: paddle.Tensor, *args, """ return sample - def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, - ets): + def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets): alpha = self.alphas[timestep_index] sigma = self.betas[timestep_index] diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 937c161348c12..9857a57444941 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -21,13 +21,11 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import randn_tensor -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
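Every step() in this diff first converts the raw model output into a predicted x0 through the same prediction_type branch (epsilon or v_prediction) before taking the actual solver step. A compact sketch of that conversion, using the formulas that appear verbatim in the hunks above and below (the helper name is mine):

def pred_original_sample(model_output, sample, sigma, prediction_type="epsilon"):
    if prediction_type == "epsilon":
        # the model predicted the noise: x0 = x - sigma * eps
        return sample - sigma * model_output
    if prediction_type == "v_prediction":
        # v-prediction: x0 = v * c_out + x * c_skip, with c_out = -sigma / sqrt(sigma**2 + 1) and c_skip = 1 / (sigma**2 + 1)
        return model_output * (-sigma / (sigma**2 + 1) ** 0.5) + sample / (sigma**2 + 1)
    raise ValueError(f"unsupported prediction_type: {prediction_type}")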
@@ -46,7 +44,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -88,31 +86,34 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, # sensible defaults - beta_end: float=0.012, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -133,9 +134,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): return indices[pos].item() def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor: + self, + sample: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + ) -> paddle.Tensor: """ Args: Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -151,13 +153,14 @@ def scale_model_input( else: sigma = self.sigmas_interpol[step_index - 1] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int]=None, ): + self, + num_inference_steps: int, + num_train_timesteps: Optional[int] = None, + ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
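The ancestral schedulers (EulerAncestral above, KDPM2Ancestral here) split each move from sigma_from down to sigma_to into a deterministic part sigma_down and a fresh-noise part sigma_up with sigma_down**2 + sigma_up**2 = sigma_to**2, using exactly the expressions in the set_timesteps and step bodies. As a scalar sketch:

def ancestral_sigmas(sigma_from, sigma_to):
    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
    return sigma_down, sigma_up

# the step then moves deterministically to sigma_down and re-injects noise scaled by sigma_up:
# prev_sample = sample + derivative * (sigma_down - sigma) + noise * sigma_up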
@@ -169,12 +172,9 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_inference_steps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) @@ -184,9 +184,8 @@ def set_timesteps( # compute up and down sigmas sigmas_next = sigmas.roll(-1) sigmas_next[-1] = 0.0 - sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas - **2)**0.5 - sigmas_down = (sigmas_next**2 - sigmas_up**2)**0.5 + sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5 + sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5 sigmas_down[-1] = 0.0 # compute interpolated sigmas @@ -194,20 +193,16 @@ def set_timesteps( sigmas_interpol[-2:] = 0.0 # set sigmas - self.sigmas = paddle.concat( - [sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) - self.sigmas_interpol = paddle.concat([ - sigmas_interpol[:1], - sigmas_interpol[1:].repeat_interleave(2), - sigmas_interpol[-1:], - ]) - self.sigmas_up = paddle.concat([ - sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:] - ]) - self.sigmas_down = paddle.concat([ - sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), - sigmas_down[-1:] - ]) + self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) + self.sigmas_interpol = paddle.concat( + [ + sigmas_interpol[:1], + sigmas_interpol[1:].repeat_interleave(2), + sigmas_interpol[-1:], + ] + ) + self.sigmas_up = paddle.concat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]]) + self.sigmas_down = paddle.concat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]]) # standard deviation of the initial noise distribution self.init_noise_sigma = self.sigmas.max() @@ -215,12 +210,9 @@ def set_timesteps( timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) timesteps_interpol = self.sigma_to_t(sigmas_interpol) - timesteps_interpol = paddle.cast( - timesteps_interpol, dtype=timesteps.dtype) + timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype) - interleaved_timesteps = paddle.stack( - (timesteps_interpol[:-2, None], timesteps[1:, None]), - axis=-1).flatten() + interleaved_timesteps = paddle.stack((timesteps_interpol[:-2, None], timesteps[1:, None]), axis=-1).flatten() self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps]) @@ -234,8 +226,7 @@ def sigma_to_t(self, sigma): dists = log_sigma - self.log_sigmas[:, None] # get sigmas range - low_idx = ((dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0) - .clip(max=self.log_sigmas.shape[0] - 2)) + low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = self.log_sigmas[low_idx] @@ -255,13 +246,13 @@ def state_in_first_order(self): return self.sample is None def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: 
Union[paddle.Tensor, np.ndarray], + timestep: Union[float, paddle.Tensor], + sample: Union[paddle.Tensor, np.ndarray], + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Args: Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion @@ -295,8 +286,7 @@ def step( gamma = 0 sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise if self.config.prediction_type == "epsilon": @@ -304,11 +294,11 @@ def step( pred_original_sample = sample - sigma_input * model_output elif self.config.prediction_type == "v_prediction": sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = model_output * (-sigma_input / ( - sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1)) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -338,22 +328,21 @@ def step( prev_sample = prev_sample + noise * sigma_up if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [ - self.index_for_timestep(t, schedule_timesteps) for t in timesteps - ] + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py index b6df7c60c3000..87790b6ece926 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -20,13 +20,11 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
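The sigma_to_t helpers further down recover a (fractional) training timestep from a sigma by interpolating in log-sigma space; the diff shows the index search, while the interpolation weight sits in elided context and follows the usual k-diffusion form. A NumPy sketch of the whole helper under that assumption:

import numpy as np

def sigma_to_t(sigma, log_sigmas):
    sigma = np.asarray(sigma, dtype=np.float64)
    log_sigma = np.log(sigma)
    dists = log_sigma - log_sigmas[:, np.newaxis]
    # low_idx: last entry with log_sigmas[low_idx] <= log_sigma (log_sigmas ascends over training timesteps),
    # clipped so that a neighbouring entry above always exists
    low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
    high_idx = low_idx + 1
    low, high = log_sigmas[low_idx], log_sigmas[high_idx]
    # linear interpolation weight between the two neighbouring log-sigmas
    w = np.clip((low - log_sigma) / (low - high), 0, 1)
    t = (1 - w) * low_idx + w * high_idx
    return t.reshape(sigma.shape)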
@@ -45,7 +43,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -87,31 +85,34 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, # sensible defaults - beta_end: float=0.012, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -132,9 +133,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): return indices[pos].item() def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor: + self, + sample: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + ) -> paddle.Tensor: """ Args: Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -150,13 +152,14 @@ def scale_model_input( else: sigma = self.sigmas_interpol[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int]=None, ): + self, + num_inference_steps: int, + num_train_timesteps: Optional[int] = None, + ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
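KDPM2 is a two-stage (Heun-like) method, so set_timesteps duplicates every inner sigma and interleaves the interpolated mid-step sigmas, giving the sampler two model evaluations per output step. The repeat_interleave pattern from the hunk below, in NumPy form:

import numpy as np

def interleave_for_two_stage(sigmas):
    # [s0, s1, s1, s2, s2, ..., s_last, s_last, s_last] -
    # mirrors concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
    return np.concatenate([sigmas[:1], np.repeat(sigmas[1:], 2), sigmas[-1:]])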
@@ -168,12 +171,9 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_inference_steps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) @@ -185,13 +185,14 @@ def set_timesteps( # must set to 0.0 sigmas_interpol[-1] = 0.0 - self.sigmas = paddle.concat( - [sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) - self.sigmas_interpol = paddle.concat([ - sigmas_interpol[:1], - sigmas_interpol[1:].repeat_interleave(2), - sigmas_interpol[-1:], - ]) + self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) + self.sigmas_interpol = paddle.concat( + [ + sigmas_interpol[:1], + sigmas_interpol[1:].repeat_interleave(2), + sigmas_interpol[-1:], + ] + ) # standard deviation of the initial noise distribution self.init_noise_sigma = self.sigmas.max() @@ -199,11 +200,8 @@ def set_timesteps( timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) # interpolate timesteps timesteps_interpol = self.sigma_to_t(sigmas_interpol) - timesteps_interpol = paddle.cast( - timesteps_interpol, dtype=timesteps.dtype) - interleaved_timesteps = paddle.stack( - (timesteps_interpol[1:-1, None], timesteps[1:, None]), - axis=-1).flatten() + timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype) + interleaved_timesteps = paddle.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), axis=-1).flatten() self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps]) @@ -217,8 +215,7 @@ def sigma_to_t(self, sigma): dists = log_sigma - self.log_sigmas[:, None] # get sigmas range - low_idx = ((dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0) - .clip(max=self.log_sigmas.shape[0] - 2)) + low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = self.log_sigmas[low_idx] @@ -238,11 +235,12 @@ def state_in_first_order(self): return self.sample is None def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: Union[paddle.Tensor, np.ndarray], + timestep: Union[float, paddle.Tensor], + sample: Union[paddle.Tensor, np.ndarray], + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Args: Predict the sample at the previous timestep by reversing the SDE. 
Core function to propagate the diffusion @@ -280,11 +278,11 @@ def step( pred_original_sample = sample - sigma_input * model_output elif self.config.prediction_type == "v_prediction": sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = model_output * (-sigma_input / ( - sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1)) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -312,22 +310,21 @@ def step( prev_sample = sample + derivative * dt if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [ - self.index_for_timestep(t, schedule_timesteps) for t in timesteps - ] + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py index ba4bf176efd6c..f104b1a69a8d9 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py @@ -81,13 +81,14 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - sigma_min: float=0.02, - sigma_max: float=100, - s_noise: float=1.007, - s_churn: float=80, - s_min: float=0.05, - s_max: float=50, ): + self, + sigma_min: float = 0.02, + sigma_max: float = 100, + s_noise: float = 1.007, + s_churn: float = 80, + s_min: float = 0.05, + s_max: float = 50, + ): # standard deviation of the initial noise distribution self.init_noise_sigma = sigma_max @@ -96,9 +97,7 @@ def __init__( self.timesteps: paddle.Tensor = None self.schedule: paddle.Tensor = None # sigma(t_i) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
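KarrasVeScheduler's add_noise_to_input below implements the "churn" trick described in its docstring: temporarily raise the noise level to sigma_hat = sigma + gamma * sigma and add just enough Gaussian noise to get there. A NumPy sketch of the same step (the sqrt(2) - 1 cap on gamma and the s_min/s_max gate are taken straight from the hunk):

import numpy as np

def add_noise_to_input(sample, sigma, s_churn, s_noise, s_min, s_max, num_inference_steps, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    gamma = min(s_churn / num_inference_steps, 2**0.5 - 1) if s_min <= sigma <= s_max else 0.0
    eps = s_noise * rng.standard_normal(sample.shape)
    sigma_hat = sigma + gamma * sigma
    # add exactly the variance needed to move from sigma**2 up to sigma_hat**2
    sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5) * eps
    return sample_hat, sigma_hat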
@@ -124,17 +123,21 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = num_inference_steps timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) - schedule = [(self.config.sigma_max - **2 * (self.config.sigma_min**2 / self.config.sigma_max**2) - **(i / (num_inference_steps - 1))) for i in self.timesteps] + schedule = [ + ( + self.config.sigma_max**2 + * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1)) + ) + for i in self.timesteps + ] self.schedule = paddle.to_tensor(schedule, dtype=paddle.float32) def add_noise_to_input( - self, - sample: paddle.Tensor, - sigma: float, - generator: Optional[paddle.Generator]=None, ) -> Tuple[ - paddle.Tensor, float]: + self, + sample: paddle.Tensor, + sigma: float, + generator: Optional[paddle.Generator] = None, + ) -> Tuple[paddle.Tensor, float]: """ Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a higher noise level sigma_hat = sigma_i + gamma_i*sigma_i. @@ -142,26 +145,25 @@ def add_noise_to_input( TODO Args: """ if self.config.s_min <= sigma <= self.config.s_max: - gamma = min(self.config.s_churn / self.num_inference_steps, - 2**0.5 - 1) + gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1) else: gamma = 0 # sample eps ~ N(0, S_noise^2 * I) - eps = self.config.s_noise * randn_tensor( - sample.shape, generator=generator) + eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator) sigma_hat = sigma + gamma * sigma - sample_hat = sample + ((sigma_hat**2 - sigma**2)**0.5 * eps) + sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps) return sample_hat, sigma_hat def step( - self, - model_output: paddle.Tensor, - sigma_hat: float, - sigma_prev: float, - sample_hat: paddle.Tensor, - return_dict: bool=True, ) -> Union[KarrasVeOutput, Tuple]: + self, + model_output: paddle.Tensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: paddle.Tensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -191,17 +193,19 @@ def step( return KarrasVeOutput( prev_sample=sample_prev, derivative=derivative, - pred_original_sample=pred_original_sample, ) + pred_original_sample=pred_original_sample, + ) def step_correct( - self, - model_output: paddle.Tensor, - sigma_hat: float, - sigma_prev: float, - sample_hat: paddle.Tensor, - sample_prev: paddle.Tensor, - derivative: paddle.Tensor, - return_dict: bool=True, ) -> Union[KarrasVeOutput, Tuple]: + self, + model_output: paddle.Tensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: paddle.Tensor, + sample_prev: paddle.Tensor, + derivative: paddle.Tensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: """ Correct the predicted sample based on the output model_output of the network. 
TODO complete description @@ -220,8 +224,7 @@ def step_correct( """ pred_original_sample = sample_prev + sigma_prev * model_output derivative_corr = (sample_prev - pred_original_sample) / sigma_prev - sample_prev = sample_hat + (sigma_prev - sigma_hat) * ( - 0.5 * derivative + 0.5 * derivative_corr) + sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr) if not return_dict: return (sample_prev, derivative) @@ -229,7 +232,8 @@ def step_correct( return KarrasVeOutput( prev_sample=sample_prev, derivative=derivative, - pred_original_sample=pred_original_sample, ) + pred_original_sample=pred_original_sample, + ) def add_noise(self, original_samples, noise, timesteps): raise NotImplementedError() diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py index 872f3891e0cf4..122b5e8dffa7d 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py @@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -106,37 +106,39 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -145,16 +147,12 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.derivatives = [] self.is_scale_input_called = False - def scale_model_input( - self, sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. @@ -167,7 +165,7 @@ def scale_model_input( """ step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) self.is_scale_input_called = True return sample @@ -186,12 +184,10 @@ def lms_derivative(tau): for k in range(order): if current_order == k: continue - prod *= (tau - self.sigmas[t - k]) / ( - self.sigmas[t - current_order] - self.sigmas[t - k]) + prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) return prod - integrated_coeff = integrate.quad( - lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] return integrated_coeff @@ -205,13 +201,8 @@ def set_timesteps(self, num_inference_steps: int): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -220,13 +211,13 @@ def set_timesteps(self, num_inference_steps: int): self.derivatives = [] def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - order: int=4, - return_dict: bool=True, ) -> Union[LMSDiscreteSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + 
order: int = 4, + return_dict: bool = True, + ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -248,7 +239,8 @@ def step( if not self.is_scale_input_called: warnings.warn( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." + ) step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] @@ -258,8 +250,7 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5 - ) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": pred_original_sample = model_output else: @@ -275,33 +266,29 @@ def step( # 3. Compute linear multistep coefficients order = min(step_index + 1, order) - lms_coeffs = [ - self.get_lms_coefficient(order, step_index, curr_order) - for curr_order in range(order) - ] + lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] # 4. Compute previous sample based on the derivatives path - prev_sample = sample + sum(coeff * derivative - for coeff, derivative in zip( - lms_coeffs, reversed(self.derivatives))) + prev_sample = sample + sum( + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) + ) if not return_dict: - return (prev_sample, ) + return (prev_sample,) - return LMSDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py index 437f108e73af3..c821dae87d35d 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py @@ -22,8 +22,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * 
math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -99,40 +98,42 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - skip_prk_steps: bool=False, - set_alpha_to_one: bool=False, - prediction_type: str="epsilon", - steps_offset: int=0, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + skip_prk_steps: bool = False, + set_alpha_to_one: bool = False, + prediction_type: str = "epsilon", + steps_offset: int = 0, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - self.final_alpha_cumprod = (paddle.to_tensor(1.0) if set_alpha_to_one - else self.alphas_cumprod[0]) + self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 @@ -168,8 +169,7 @@ def set_timesteps(self, num_inference_steps: int): step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 - self._timesteps = (np.arange(0, num_inference_steps) * - step_ratio).round() + self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() self._timesteps += self.config.steps_offset if self.config.skip_prk_steps: @@ -177,25 +177,20 @@ def set_timesteps(self, num_inference_steps: int): # produce better results. 
When using PNDM with `self.config.skip_prk_steps` the implementation # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 self.prk_timesteps = np.array([]) - self.plms_timesteps = np.concatenate([ - self._timesteps[:-1], self._timesteps[-2:-1], - self._timesteps[-1:] - ])[::-1].copy() + self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[ + ::-1 + ].copy() else: - prk_timesteps = np.array(self._timesteps[-self.pndm_order:]).repeat( - 2) + np.tile( - np.array([ - 0, self.config.num_train_timesteps // - num_inference_steps // 2 - ]), - self.pndm_order, ) - self.prk_timesteps = ( - prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() - self.plms_timesteps = self._timesteps[:-3][::-1].copy( - ) # we copy to avoid having negative strides which are not supported by paddle - - timesteps = np.concatenate( - [self.prk_timesteps, self.plms_timesteps]).astype(np.int64) + prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile( + np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), + self.pndm_order, + ) + self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() + self.plms_timesteps = self._timesteps[:-3][ + ::-1 + ].copy() # we copy to avoid having negative strides which are not supported by paddle + + timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) self.ets = [] @@ -203,11 +198,12 @@ def set_timesteps(self, num_inference_steps: int): self.cur_model_output = 0 def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -227,26 +223,28 @@ def step( returning a tuple, the first element is the sample tensor. """ - if self.counter < len( - self.prk_timesteps) and not self.config.skip_prk_steps: + if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps: return self.step_prk( model_output=model_output, timestep=timestep, sample=sample, - return_dict=return_dict, ) + return_dict=return_dict, + ) else: return self.step_plms( model_output=model_output, timestep=timestep, sample=sample, - return_dict=return_dict, ) + return_dict=return_dict, + ) def step_prk( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the solution to the differential equation. 
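To make the PLMS timestep bookkeeping above concrete, here is a rough NumPy reconstruction of the `skip_prk_steps` branch (constants chosen arbitrarily; the duplicated second-to-last step mirrors the concatenation in `set_timesteps`):

```python
import numpy as np

num_train_timesteps, num_inference_steps, steps_offset = 1000, 10, 1
step_ratio = num_train_timesteps // num_inference_steps
_timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() + steps_offset
plms_timesteps = np.concatenate(
    [_timesteps[:-1], _timesteps[-2:-1], _timesteps[-1:]]
)[::-1].astype(np.int64)
print(plms_timesteps)
# -> [901 801 801 701 601 501 401 301 201 101 1]
```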
@@ -268,9 +266,7 @@ def step_prk( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" ) - diff_to_prev = (0 - if self.counter % 2 else self.config.num_train_timesteps - // self.num_inference_steps // 2) + diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2 prev_timestep = timestep - diff_to_prev timestep = self.prk_timesteps[self.counter // 4 * 4] @@ -289,21 +285,21 @@ def step_prk( # cur_sample should not be `None` cur_sample = self.cur_sample if self.cur_sample is not None else sample - prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, - model_output) + prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output) self.counter += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def step_plms( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple times to approximate the solution. @@ -330,18 +326,17 @@ def step_plms( f"{self.__class__} can only be run AFTER scheduler has been run " "in 'prk' mode for at least 12 iterations " "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py " - "for more information.") + "for more information." + ) - prev_timestep = (timestep - self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps if self.counter != 1: self.ets = self.ets[-3:] self.ets.append(model_output) else: prev_timestep = timestep - timestep = (timestep + self.config.num_train_timesteps // - self.num_inference_steps) + timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps if len(self.ets) == 1 and self.counter == 0: model_output = model_output @@ -353,23 +348,19 @@ def step_plms( elif len(self.ets) == 2: model_output = (3 * self.ets[-1] - self.ets[-2]) / 2 elif len(self.ets) == 3: - model_output = ( - 23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 + model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 else: - model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + - 37 * self.ets[-3] - 9 * self.ets[-4]) + model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) - prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, - model_output) + prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output) self.counter += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
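The `step_plms` hunk above combines the history of noise predictions with Adams-Bashforth style weights once enough `ets` entries exist. A simplified sketch of that blend, ignoring the special-cased first two counter steps (helper name is hypothetical):

```python
def blend_ets(ets):
    # linear multistep combination of the most recent noise predictions
    if len(ets) == 1:
        return ets[-1]
    if len(ets) == 2:
        return (3 * ets[-1] - ets[-2]) / 2
    if len(ets) == 3:
        return (23 * ets[-1] - 16 * ets[-2] + 5 * ets[-3]) / 12
    return (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4]) / 24

print(blend_ets([1.0, 2.0, 3.0, 4.0]))  # 4th-order estimate from scalar stand-ins
```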
@@ -396,14 +387,12 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): # model_output -> e_θ(x_t, t) # prev_sample -> x_(t−δ) alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev if self.config.prediction_type == "v_prediction": - model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample elif self.config.prediction_type != "epsilon": raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`" @@ -413,41 +402,41 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): # denominator of x_t in formula (9) and plus 1 # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = # sqrt(α_(t−δ)) / sqrt(α_t)) - sample_coeff = (alpha_prod_t_prev / alpha_prod_t)**(0.5) + sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) # corresponds to denominator of e_θ(x_t, t) in formula (9) - model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev**(0.5) + ( - alpha_prod_t * beta_prod_t * alpha_prod_t_prev)**(0.5) + model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + ( + alpha_prod_t * beta_prod_t * alpha_prod_t_prev + ) ** (0.5) # full formula (9) - prev_sample = (sample_coeff * sample - - (alpha_prod_t_prev - alpha_prod_t - ) * model_output / model_output_denom_coeff) + prev_sample = ( + sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff + ) return prev_sample # Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py index 71460c026a92b..d040c40ba5124 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py @@ -64,7 +64,7 @@ def 
betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -109,26 +109,30 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - eta: float=0.0, - trained_betas: Optional[np.ndarray]=None, - clip_sample: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + eta: float = 0.0, + trained_betas: Optional[np.ndarray] = None, + clip_sample: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) @@ -137,8 +141,7 @@ def __init__( betas = paddle.linspace(-6, 6, num_train_timesteps) self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -151,14 +154,11 @@ def __init__( # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) self.eta = eta - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
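Several of the files touched here reformat the same `betas_for_alpha_bar` helper. As a standalone reference, this is the cosine construction it implements; the loop body follows the standard definition and the return type is simplified to a plain list:

```python
import math

def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    def alpha_bar(time_step):
        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # cap each beta so the schedule stays numerically stable near t = 1
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return betas

print(betas_for_alpha_bar(5))  # small, monotonically increasing values
```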
@@ -173,12 +173,12 @@ def scale_model_input(self, return sample def set_timesteps( - self, - num_inference_steps: int, - jump_length: int=10, - jump_n_sample: int=10, ): - num_inference_steps = min(self.config.num_train_timesteps, - num_inference_steps) + self, + num_inference_steps: int, + jump_length: int = 10, + jump_n_sample: int = 10, + ): + num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps) self.num_inference_steps = num_inference_steps timesteps = [] @@ -198,16 +198,14 @@ def set_timesteps( t = t + 1 timesteps.append(t) - timesteps = np.array(timesteps) * (self.config.num_train_timesteps // - self.num_inference_steps) + timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps) self.timesteps = paddle.to_tensor(timesteps) def _get_variance(self, t): prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev @@ -218,21 +216,20 @@ def _get_variance(self, t): # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf # without eta. # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t] - variance = (beta_prod_t_prev / beta_prod_t) * ( - 1 - alpha_prod_t / alpha_prod_t_prev) + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) return variance def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - original_image: paddle.Tensor, - mask: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[RePaintSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + original_image: paddle.Tensor, + mask: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[RePaintSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -258,19 +255,16 @@ def step( """ t = timestep - prev_timestep = (timestep - self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps # 1. compute alphas, betas alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t # 2. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample = ( - sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5 + pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5 # 3. Clip "predicted x_0" if self.config.clip_sample: @@ -284,9 +278,8 @@ def step( # been observed. # 5. 
Add noise - noise = randn_tensor( - model_output.shape, generator=generator, dtype=model_output.dtype) - std_dev_t = self.eta * self._get_variance(timestep)**0.5 + noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) + std_dev_t = self.eta * self._get_variance(timestep) ** 0.5 variance = 0 if t > 0 and self.eta > 0: @@ -294,51 +287,44 @@ def step( # 6. compute "direction pointing to x_t" of formula (12) # from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = ( - 1 - alpha_prod_t_prev - std_dev_t**2)**0.5 * model_output + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_unknown_part = (alpha_prod_t_prev**0.5 * pred_original_sample + - pred_sample_direction + variance) + prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf - prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ( - (1 - alpha_prod_t_prev)**0.5) * noise + prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ((1 - alpha_prod_t_prev) ** 0.5) * noise # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf - pred_prev_sample = mask * prev_known_part + (1.0 - mask - ) * prev_unknown_part + pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part if not return_dict: return ( pred_prev_sample, - pred_original_sample, ) + pred_original_sample, + ) - return RePaintSchedulerOutput( - prev_sample=pred_prev_sample, - pred_original_sample=pred_original_sample) + return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) def undo_step(self, sample, timestep, generator=None): n = self.config.num_train_timesteps // self.num_inference_steps for i in range(n): beta = self.betas[timestep + i] - noise = randn_tensor( - sample.shape, generator=generator, dtype=sample.dtype) + noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype) # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf - sample = (1 - beta)**0.5 * sample + beta**0.5 * noise + sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: - raise NotImplementedError( - "Use `DDPMScheduler.add_noise()` to train for sampling with RePaint." 
- ) + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") def __len__(self): return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py index 3513d6691d0e5..83644fdecc48a 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py @@ -71,13 +71,14 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=2000, - snr: float=0.15, - sigma_min: float=0.01, - sigma_max: float=1348.0, - sampling_eps: float=1e-5, - correct_steps: int=1, ): + self, + num_train_timesteps: int = 2000, + snr: float = 0.15, + sigma_min: float = 0.01, + sigma_max: float = 1348.0, + sampling_eps: float = 1e-5, + correct_steps: int = 1, + ): # standard deviation of the initial noise distribution self.init_noise_sigma = sigma_max @@ -86,9 +87,7 @@ def __init__( self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -102,7 +101,7 @@ def scale_model_input(self, """ return sample - def set_timesteps(self, num_inference_steps: int, sampling_eps: float=None): + def set_timesteps(self, num_inference_steps: int, sampling_eps: float = None): """ Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -113,17 +112,17 @@ def set_timesteps(self, num_inference_steps: int, sampling_eps: float=None): final timestep value (overrides value given at Scheduler instantiation). """ - sampling_eps = (sampling_eps if sampling_eps is not None else - self.config.sampling_eps) + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps self.timesteps = paddle.linspace(1, sampling_eps, num_inference_steps) def set_sigmas( - self, - num_inference_steps: int, - sigma_min: float=None, - sigma_max: float=None, - sampling_eps: float=None, ): + self, + num_inference_steps: int, + sigma_min: float = None, + sigma_max: float = None, + sampling_eps: float = None, + ): """ Sets the noise scales used for the diffusion chain. Supporting function to be run before inference. 
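For the ScoreSdeVeScheduler noise scales described here, the continuous sigma schedule is geometric in t while `discrete_sigmas` is a log-uniform grid. A NumPy sketch with the default config values (variable names assumed):

```python
import numpy as np

sigma_min, sigma_max, sampling_eps, n = 0.01, 1348.0, 1e-5, 6
timesteps = np.linspace(1.0, sampling_eps, n)

# geometric interpolation: sigma(1) = sigma_max, sigma(~0) ~= sigma_min
sigmas = sigma_min * (sigma_max / sigma_min) ** timesteps
# log-uniform grid used for the discrete sigma lookup
discrete_sigmas = np.exp(np.linspace(np.log(sigma_min), np.log(sigma_max), n))

print(sigmas.round(2))           # decreasing from 1348.0 toward 0.01
print(discrete_sigmas.round(4))  # increasing from 0.01 to 1348.0
```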
@@ -142,33 +141,31 @@ def set_sigmas( """ sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max - sampling_eps = (sampling_eps if sampling_eps is not None else - self.config.sampling_eps) + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps if self.timesteps is None: self.set_timesteps(num_inference_steps, sampling_eps) - self.sigmas = sigma_min * (sigma_max / sigma_min)**(self.timesteps / - sampling_eps) + self.sigmas = sigma_min * (sigma_max / sigma_min) ** (self.timesteps / sampling_eps) self.discrete_sigmas = paddle.exp( - paddle.linspace( - math.log(sigma_min), math.log(sigma_max), num_inference_steps)) - self.sigmas = paddle.to_tensor( - [sigma_min * (sigma_max / sigma_min)**t for t in self.timesteps]) + paddle.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps) + ) + self.sigmas = paddle.to_tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps]) def get_adjacent_sigma(self, timesteps, t): return paddle.where( timesteps == 0, paddle.zeros_like(t), - self.discrete_sigmas[timesteps - 1], ) + self.discrete_sigmas[timesteps - 1], + ) def step_pred( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[SdeVeOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[SdeVeOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -191,15 +188,13 @@ def step_pred( "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" ) - timestep = timestep * paddle.ones( - (sample.shape[0], - )) # paddle.repeat_interleave(timestep, sample.shape[0]) + timestep = timestep * paddle.ones((sample.shape[0],)) # paddle.repeat_interleave(timestep, sample.shape[0]) timesteps = (timestep * (len(self.timesteps) - 1)).cast("int64") sigma = self.discrete_sigmas[timesteps] adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep) drift = paddle.zeros_like(sample) - diffusion = (sigma**2 - adjacent_sigma**2)**0.5 + diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5 # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x) # also equation 47 shows the analog from SDE models to ancestral sampling methods @@ -209,28 +204,23 @@ def step_pred( drift = drift - diffusion**2 * model_output # equation 6: sample noise for the diffusion term of - noise = randn_tensor( - sample.shape, generator=generator, dtype=sample.dtype) - prev_sample_mean = ( - sample - drift - ) # subtract because `dt` is a small negative timestep + noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype) + prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep # TODO is the variable diffusion the correct scaling term for the noise? 
- prev_sample = (prev_sample_mean + diffusion * noise - ) # add impact of diffusion field g + prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g if not return_dict: return (prev_sample, prev_sample_mean) - return SdeVeOutput( - prev_sample=prev_sample, prev_sample_mean=prev_sample_mean) + return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean) def step_correct( - self, - model_output: paddle.Tensor, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Correct the predicted sample based on the output model_output of the network. This is often run repeatedly after making the prediction for the previous timestep. @@ -257,12 +247,10 @@ def step_correct( noise = randn_tensor(sample.shape, generator=generator) # compute step size from the model_output, the noise, and the snr - grad_norm = paddle.norm( - model_output.reshape([model_output.shape[0], -1]), axis=-1).mean() - noise_norm = paddle.norm( - noise.reshape([noise.shape[0], -1]), axis=-1).mean() - step_size = (self.config.snr * noise_norm / grad_norm)**2 * 2 - step_size = step_size * paddle.ones((sample.shape[0], )) + grad_norm = paddle.norm(model_output.reshape([model_output.shape[0], -1]), axis=-1).mean() + noise_norm = paddle.norm(noise.reshape([noise.shape[0], -1]), axis=-1).mean() + step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2 + step_size = step_size * paddle.ones((sample.shape[0],)) # self.repeat_scalar(step_size, sample.shape[0]) # compute corrected sample: model_output term and noise term @@ -270,23 +258,22 @@ def step_correct( while len(step_size.shape) < len(sample.shape): step_size = step_size.unsqueeze(-1) prev_sample_mean = sample + step_size * model_output - prev_sample = prev_sample_mean + ((step_size * 2)**0.5) * noise + prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.discrete_sigmas[timesteps] - noise = (paddle.randn( - original_samples.shape, - dtype=original_samples.dtype) * sigmas[:, None, None, None]) + noise = paddle.randn(original_samples.shape, dtype=original_samples.dtype) * sigmas[:, None, None, None] noisy_samples = noise + original_samples return noisy_samples diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py index 23b4303cbf257..c0e1eebc3eb96 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py @@ -42,18 +42,13 @@ class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin): order = 1 @register_to_config - def __init__(self, - num_train_timesteps=2000, - beta_min=0.1, - beta_max=20, - sampling_eps=1e-3): + def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3): self.sigmas = 
None self.discrete_sigmas = None self.timesteps = None def set_timesteps(self, num_inference_steps): - self.timesteps = paddle.linspace(1, self.config.sampling_eps, - num_inference_steps) + self.timesteps = paddle.linspace(1, self.config.sampling_eps, num_inference_steps) def step_pred(self, score, x, t, generator=None): if self.timesteps is None: @@ -63,9 +58,9 @@ def step_pred(self, score, x, t, generator=None): # TODO(Patrick) better comments + non-Paddle # postprocess model score - log_mean_coeff = (-0.25 * t**2 * - (self.config.beta_max - self.config.beta_min - ) - 0.5 * t * self.config.beta_min) + log_mean_coeff = ( + -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min + ) std = paddle.sqrt(1.0 - paddle.exp(2.0 * log_mean_coeff)) std = std.flatten() while len(std.shape) < len(score.shape): @@ -75,8 +70,7 @@ def step_pred(self, score, x, t, generator=None): # compute dt = -1.0 / len(self.timesteps) - beta_t = self.config.beta_min + t * (self.config.beta_max - - self.config.beta_min) + beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min) beta_t = beta_t.flatten() while len(beta_t.shape) < len(x.shape): beta_t = beta_t.unsqueeze(-1) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py index 8b809e90c7159..491409f76a5e6 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py @@ -64,7 +64,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -102,17 +102,16 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - variance_type: str="fixed_small_log", - clip_sample: bool=True, - clip_sample_range: Optional[float]=1.0, - prediction_type: str="epsilon", - beta_schedule: str="squaredcos_cap_v2", ): + self, + num_train_timesteps: int = 1000, + variance_type: str = "fixed_small_log", + clip_sample: bool = True, + clip_sample_range: Optional[float] = 1.0, + prediction_type: str = "epsilon", + beta_schedule: str = "squaredcos_cap_v2", + ): if beta_schedule != "squaredcos_cap_v2": - raise ValueError( - "UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'" - ) + raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'") self.betas = betas_for_alpha_bar(num_train_timesteps) @@ -125,14 +124,11 @@ def __init__( # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) self.variance_type = variance_type - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -159,23 +155,16 @@ def set_timesteps(self, num_inference_steps: int): the number of diffusion steps used when generating samples with a pre-trained model. 
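The `step_pred` change in scheduling_sde_vp above rewrites the marginal statistics of the VP SDE; for reference, the mean coefficient and standard deviation it computes look like this in plain NumPy (helper name is illustrative):

```python
import numpy as np

def vp_marginal(t, beta_min=0.1, beta_max=20.0):
    # log of the mean coefficient of x_t | x_0 for a linear beta(t) schedule
    log_mean_coeff = -0.25 * t**2 * (beta_max - beta_min) - 0.5 * t * beta_min
    std = np.sqrt(1.0 - np.exp(2.0 * log_mean_coeff))
    return np.exp(log_mean_coeff), std

mean_coeff, std = vp_marginal(np.array([0.1, 0.5, 1.0]))
print(mean_coeff.round(3), std.round(3))  # mean shrinks toward 0, std grows toward 1
```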
""" self.num_inference_steps = num_inference_steps - step_ratio = (self.config.num_train_timesteps - 1) / ( - self.num_inference_steps - 1) - timesteps = ((np.arange(0, num_inference_steps) * step_ratio) - .round()[::-1].copy().astype(np.int64)) + step_ratio = (self.config.num_train_timesteps - 1) / (self.num_inference_steps - 1) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) - def _get_variance(self, - t, - prev_timestep=None, - predicted_variance=None, - variance_type=None): + def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: prev_timestep = t - 1 alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 else self.one) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev @@ -207,13 +196,14 @@ def _get_variance(self, return variance def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - prev_timestep: Optional[int]=None, - generator=None, - return_dict: bool=True, ) -> Union[UnCLIPSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + prev_timestep: Optional[int] = None, + generator=None, + return_dict: bool = True, + ) -> Union[UnCLIPSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -236,12 +226,11 @@ def step( """ t = timestep - if (model_output.shape[1] == sample.shape[1] * 2 and - self.variance_type == "learned_range"): + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type == "learned_range": # must split like this, 3 -> split 2 -> [2, 1] model_output, predicted_variance = model_output.split( - [sample.shape[1], model_output.shape[1] - sample.shape[1]], - axis=1) + [sample.shape[1], model_output.shape[1] - sample.shape[1]], axis=1 + ) else: predicted_variance = None @@ -250,8 +239,7 @@ def step( prev_timestep = t - 1 alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 else self.one) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev @@ -265,32 +253,31 @@ def step( # 2. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) elif self.config.prediction_type == "sample": pred_original_sample = model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `sample`" - " for the UnCLIPScheduler.") + " for the UnCLIPScheduler." + ) # 3. Clip "predicted x_0" if self.config.clip_sample: pred_original_sample = paddle.clip( pred_original_sample, -self.config.clip_sample_range, - self.config.clip_sample_range, ) + self.config.clip_sample_range, + ) # 4. 
Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample_coeff = (alpha_prod_t_prev - **(0.5) * beta) / beta_prod_t - current_sample_coeff = alpha**(0.5) * beta_prod_t_prev / beta_prod_t + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t + current_sample_coeff = alpha ** (0.5) * beta_prod_t_prev / beta_prod_t # 5. Compute predicted previous sample µ_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_prev_sample = (pred_original_sample_coeff * pred_original_sample + - current_sample_coeff * sample) + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample # 6. Add noise variance = 0 @@ -298,12 +285,14 @@ def step( variance_noise = randn_tensor( model_output.shape, dtype=model_output.dtype, - generator=generator, ) + generator=generator, + ) variance = self._get_variance( t, predicted_variance=predicted_variance, - prev_timestep=prev_timestep, ) + prev_timestep=prev_timestep, + ) if self.variance_type == "fixed_small_log": variance = variance @@ -312,15 +301,14 @@ def step( else: raise ValueError( f"variance_type given as {self.variance_type} must be one of `fixed_small_log` or `learned_range`" - " for the UnCLIPScheduler.") + " for the UnCLIPScheduler." + ) variance = variance * variance_noise pred_prev_sample = pred_prev_sample + variance if not return_dict: - return (pred_prev_sample, ) + return (pred_prev_sample,) - return UnCLIPSchedulerOutput( - prev_sample=pred_prev_sample, - pred_original_sample=pred_original_sample) + return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py index 4fb50fb0e19c2..fa85c31efc8c1 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py @@ -23,8 +23,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -126,40 +125,43 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - predict_x0: bool=True, - solver_type: str="bh2", - lower_order_final: bool=True, - disable_corrector: List[int]=[], - solver_p: SchedulerMixin=None, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 
2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + predict_x0: bool = True, + solver_type: str = "bh2", + lower_order_final: bool = True, + disable_corrector: List[int] = [], + solver_p: SchedulerMixin = None, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -175,16 +177,12 @@ def __init__( if solver_type in ["midpoint", "heun", "logrho"]: self.register_to_config(solver_type="bh1") else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") self.predict_x0 = predict_x0 # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.timestep_list = [None] * solver_order @@ -201,9 +199,12 @@ def set_timesteps(self, num_inference_steps: int): num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. """ - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
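The comment above about duplicate timesteps can be reproduced directly: rounding the inference grid onto training steps collides once num_inference_steps approaches num_train_timesteps. A quick NumPy check (values are arbitrary):

```python
import numpy as np

num_train_timesteps, num_inference_steps = 1000, 1000
timesteps = (
    np.linspace(0, num_train_timesteps - 1, num_inference_steps + 1)
    .round()[::-1][:-1]
    .astype(np.int64)
)
# fewer unique values than entries means rounding produced collisions
print(len(timesteps), len(np.unique(timesteps)))
```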
@@ -214,7 +215,9 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = len(timesteps) - self.model_outputs = [None, ] * self.config.solver_order + self.model_outputs = [ + None, + ] * self.config.solver_order self.lower_order_nums = 0 self.last_sample = None if self.solver_p: @@ -242,8 +245,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -251,21 +253,15 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) return sample - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: r""" Convert the model output to the corresponding type that the algorithm PC needs. @@ -280,19 +276,18 @@ def convert_model_output(self, """ if self.predict_x0: if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = (sample - sigma_t * model_output) / alpha_t elif self.config.prediction_type == "sample": x0_pred = model_output elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = alpha_t * sample - sigma_t * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the UniPCMultistepScheduler.") + " `v_prediction` for the UniPCMultistepScheduler." 
+ ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -302,26 +297,26 @@ def convert_model_output(self, if self.config.prediction_type == "epsilon": return model_output elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = (sample - alpha_t * model_output) / sigma_t return epsilon elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = alpha_t * model_output + sigma_t * sample return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the UniPCMultistepScheduler.") + " `v_prediction` for the UniPCMultistepScheduler." + ) def multistep_uni_p_bh_update( - self, - model_output: paddle.Tensor, - prev_timestep: int, - sample: paddle.Tensor, - order: int, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + prev_timestep: int, + sample: paddle.Tensor, + order: int, + ) -> paddle.Tensor: """ One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. @@ -424,12 +419,13 @@ def multistep_uni_p_bh_update( return x_t def multistep_uni_c_bh_update( - self, - this_model_output: paddle.Tensor, - this_timestep: int, - last_sample: paddle.Tensor, - this_sample: paddle.Tensor, - order: int, ) -> paddle.Tensor: + self, + this_model_output: paddle.Tensor, + this_timestep: int, + last_sample: paddle.Tensor, + this_sample: paddle.Tensor, + order: int, + ) -> paddle.Tensor: """ One step for the UniC (B(h) version). @@ -512,8 +508,7 @@ def multistep_uni_c_bh_update( if self.predict_x0: x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 if D1s is not None: - corr_res = paddle.einsum("k,bkchw->bchw", - rhos_c[:-1].squeeze(1), D1s) + corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s) else: corr_res = 0 D1_t = model_t - m0 @@ -521,8 +516,7 @@ def multistep_uni_c_bh_update( else: x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 if D1s is not None: - corr_res = paddle.einsum("k,bkchw->bchw", - rhos_c[:-1].squeeze(1), D1s) + corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s) else: corr_res = 0 D1_t = model_t - m0 @@ -531,11 +525,12 @@ def multistep_uni_c_bh_update( return x_t def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep UniPC. 
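The `_threshold_sample` hunk above applies Imagen-style dynamic thresholding to the predicted x0. A NumPy sketch of the same idea, with shapes and names assumed for illustration:

```python
import numpy as np

def threshold_sample(x0, ratio=0.995, sample_max_value=1.0):
    b = x0.shape[0]
    flat = np.abs(x0.reshape(b, -1))
    s = np.quantile(flat, ratio, axis=1)       # per-sample dynamic threshold
    if sample_max_value < 1:
        s = np.full_like(s, sample_max_value)
    else:
        s = np.clip(s, 1.0, sample_max_value)  # clamp to at least 1
    s = s.reshape(b, *([1] * (x0.ndim - 1)))   # broadcast over remaining dims
    return np.clip(x0, -s, s) / s              # threshold to [-s, s], then rescale

x0 = 3.0 * np.random.randn(2, 3, 8, 8)
print(np.abs(threshold_sample(x0)).max())  # bounded by 1.0 after rescaling
```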
@@ -563,23 +558,22 @@ def step( else: step_index = step_index.item() - use_corrector = (step_index > 0 and - step_index - 1 not in self.disable_corrector and - self.last_sample is not None) + use_corrector = ( + step_index > 0 and step_index - 1 not in self.disable_corrector and self.last_sample is not None + ) - model_output_convert = self.convert_model_output(model_output, timestep, - sample) + model_output_convert = self.convert_model_output(model_output, timestep, sample) if use_corrector: sample = self.multistep_uni_c_bh_update( this_model_output=model_output_convert, this_timestep=timestep, last_sample=self.last_sample, this_sample=sample, - order=self.this_order, ) + order=self.this_order, + ) # now prepare to run the predictor - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] for i in range(self.config.solver_order - 1): self.model_outputs[i] = self.model_outputs[i + 1] @@ -589,13 +583,11 @@ def step( self.timestep_list[-1] = timestep if self.config.lower_order_final: - this_order = min(self.config.solver_order, - len(self.timesteps) - step_index) + this_order = min(self.config.solver_order, len(self.timesteps) - step_index) else: this_order = self.config.solver_order - self.this_order = min(this_order, - self.lower_order_nums + 1) # warmup for multistep + self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep assert self.this_order > 0 self.last_sample = sample @@ -603,18 +595,18 @@ def step( model_output=model_output, # pass the original non-converted model output, in case solver-p is used prev_timestep=prev_timestep, sample=sample, - order=self.this_order, ) + order=self.this_order, + ) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
@@ -629,26 +621,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, # Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py index 96707f403e49a..d5bcdca0d1a9f 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py @@ -76,11 +76,12 @@ class SchedulerMixin: @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: Dict[str, Any]=None, - subfolder: Optional[str]=None, - return_unused_kwargs: bool=False, - **kwargs, ): + cls, + pretrained_model_name_or_path: Dict[str, Any] = None, + subfolder: Optional[str] = None, + return_unused_kwargs: bool = False, + **kwargs, + ): r""" Instantiate a Scheduler class from a pre-defined JSON configuration file inside a directory or Hub repo. @@ -142,15 +143,16 @@ def from_pretrained( subfolder=subfolder, return_unused_kwargs=True, return_commit_hash=True, - **kwargs, ) - return cls.from_config( - config, return_unused_kwargs=return_unused_kwargs, **kwargs) + **kwargs, + ) + return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - push_to_hub: bool=False, - **kwargs, ): + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + **kwargs, + ): """ Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the [`~SchedulerMixin.from_pretrained`] class method. @@ -159,8 +161,7 @@ def save_pretrained( save_directory (`str` or `os.PathLike`): Directory where the configuration JSON file will be saved (will be created if it does not exist). 
""" - self.save_config( - save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) + self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) @property def compatibles(self): @@ -177,7 +178,6 @@ def _get_compatibles(cls): compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) diffusers_library = importlib.import_module(__name__.split(".")[0]) compatible_classes = [ - getattr(diffusers_library, c) for c in compatible_classes_str - if hasattr(diffusers_library, c) + getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) ] return compatible_classes diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py index 71ee1bc4ad4e8..f9f3c34bba785 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py @@ -69,8 +69,7 @@ def index_to_log_onehot(x: paddle.Tensor, num_classes: int) -> paddle.Tensor: return log_x -def gumbel_noised(logits: paddle.Tensor, - generator: Optional[paddle.Generator]) -> paddle.Tensor: +def gumbel_noised(logits: paddle.Tensor, generator: Optional[paddle.Generator]) -> paddle.Tensor: """ Apply gumbel noise to `logits` """ @@ -80,34 +79,32 @@ def gumbel_noised(logits: paddle.Tensor, return noised -def alpha_schedules(num_diffusion_timesteps: int, - alpha_cum_start=0.99999, - alpha_cum_end=0.000009): +def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009): """ Cumulative and non-cumulative alpha schedules. See section 4.1. """ - att = (np.arange(0, num_diffusion_timesteps) / - (num_diffusion_timesteps - 1) * - (alpha_cum_end - alpha_cum_start) + alpha_cum_start) + att = ( + np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start) + + alpha_cum_start + ) att = np.concatenate(([1], att)) at = att[1:] / att[:-1] att = np.concatenate((att[1:], [1])) return at, att -def gamma_schedules(num_diffusion_timesteps: int, - gamma_cum_start=0.000009, - gamma_cum_end=0.99999): +def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999): """ Cumulative and non-cumulative gamma schedules. See section 4.1. 
""" - ctt = (np.arange(0, num_diffusion_timesteps) / - (num_diffusion_timesteps - 1) * - (gamma_cum_end - gamma_cum_start) + gamma_cum_start) + ctt = ( + np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start) + + gamma_cum_start + ) ctt = np.concatenate(([0], ctt)) one_minus_ctt = 1 - ctt one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1] @@ -155,13 +152,14 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_vec_classes: int, - num_train_timesteps: int=100, - alpha_cum_start: float=0.99999, - alpha_cum_end: float=0.000009, - gamma_cum_start: float=0.000009, - gamma_cum_end: float=0.99999, ): + self, + num_vec_classes: int, + num_train_timesteps: int = 100, + alpha_cum_start: float = 0.99999, + alpha_cum_end: float = 0.000009, + gamma_cum_start: float = 0.000009, + gamma_cum_end: float = 0.99999, + ): self.num_embed = num_vec_classes # By convention, the index for the mask class is the last class index @@ -170,11 +168,13 @@ def __init__( at, att = alpha_schedules( num_train_timesteps, alpha_cum_start=alpha_cum_start, - alpha_cum_end=alpha_cum_end, ) + alpha_cum_end=alpha_cum_end, + ) ct, ctt = gamma_schedules( num_train_timesteps, gamma_cum_start=gamma_cum_start, - gamma_cum_end=gamma_cum_end, ) + gamma_cum_end=gamma_cum_end, + ) num_non_mask_classes = self.num_embed - 1 bt = (1 - at - ct) / num_non_mask_classes @@ -203,8 +203,7 @@ def __init__( # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) def set_timesteps(self, num_inference_steps: int): """ @@ -219,14 +218,13 @@ def set_timesteps(self, num_inference_steps: int): self.timesteps = paddle.to_tensor(timesteps) def step( - self, - model_output: paddle.Tensor, - timestep: paddle.Tensor, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[VQDiffusionSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: paddle.Tensor, + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[VQDiffusionSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep via the reverse transition distribution i.e. Equation (11). See the docstring for `self.q_posterior` for more in depth docs on how Equation (11) is computed. @@ -263,7 +261,7 @@ def step( x_t_min_1 = log_p_x_t_min_1.argmax(axis=1) if not return_dict: - return (x_t_min_1, ) + return (x_t_min_1,) return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1) @@ -299,10 +297,12 @@ def q_posterior(self, log_p_x_0, x_t, t): log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class( - t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True) + t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True + ) log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class( - t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False) + t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False + ) # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) # . . . @@ -384,12 +384,9 @@ def q_posterior(self, log_p_x_0, x_t, t): # The last row is trivially verified. 
The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities. return log_p_x_t_min_1 - def log_Q_t_transitioning_to_known_class(self, - *, - t: paddle.Tensor, - x_t: paddle.Tensor, - log_onehot_x_t: paddle.Tensor, - cumulative: bool): + def log_Q_t_transitioning_to_known_class( + self, *, t: paddle.Tensor, x_t: paddle.Tensor, log_onehot_x_t: paddle.Tensor, cumulative: bool + ): """ Returns the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each latent pixel in `x_t`. @@ -462,9 +459,7 @@ def log_Q_t_transitioning_to_known_class(self, # # `P(x_t=mask|x_{t-1=mask}) = 1` and 1 will be the value of the last row of the onehot vector # if x_t is masked - log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, - -1, :].unsqueeze( - 1) + log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1) # `index_to_log_onehot` will add onehot vectors for masked pixels, # so the default one hot matrix has one too many rows. See the doc string @@ -486,14 +481,12 @@ def log_Q_t_transitioning_to_known_class(self, # The whole column of each masked pixel is `c` mask_class_mask = x_t == self.mask_class - mask_class_mask = mask_class_mask.unsqueeze(1).expand( - [-1, self.num_embed - 1, -1]) + mask_class_mask = mask_class_mask.unsqueeze(1).expand([-1, self.num_embed - 1, -1]) # log_Q_t[mask_class_mask] = c log_Q_t = paddle.where(mask_class_mask, c, log_Q_t) if not cumulative: - log_Q_t = paddle.concat( - (log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1) + log_Q_t = paddle.concat((log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1) return log_Q_t diff --git a/ppdiffusers/ppdiffusers/training_utils.py b/ppdiffusers/ppdiffusers/training_utils.py index dba0703882e22..32a8251578ea7 100644 --- a/ppdiffusers/ppdiffusers/training_utils.py +++ b/ppdiffusers/ppdiffusers/training_utils.py @@ -67,17 +67,18 @@ class EMAModel: """ def __init__( - self, - parameters, - decay: float=0.9999, - min_decay: float=0.0, - update_after_step: int=0, - use_ema_warmup: bool=False, - inv_gamma: Union[float, int]=1.0, - power: Union[float, int]=2 / 3, - model_cls: Optional[Any]=None, - model_config: Dict[str, Any]=None, - **kwargs, ): + self, + parameters, + decay: float = 0.9999, + min_decay: float = 0.0, + update_after_step: int = 0, + use_ema_warmup: bool = False, + inv_gamma: Union[float, int] = 1.0, + power: Union[float, int] = 2 / 3, + model_cls: Optional[Any] = None, + model_config: Dict[str, Any] = None, + **kwargs, + ): """ Args: parameters (Iterable[nn.Parameter]): The parameters to track. @@ -99,39 +100,35 @@ def __init__( if isinstance(parameters, nn.Layer): deprecation_message = ( "Passing a `nn.Layer` to `ExponentialMovingAverage` is deprecated. " - "Please pass the parameters of the module instead.") + "Please pass the parameters of the module instead." + ) deprecate( "passing a `nn.Layer` to `ExponentialMovingAverage`", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) parameters = parameters.parameters() # set use_ema_warmup to True if a nn.Layer is passed for backwards compatibility use_ema_warmup = True if kwargs.get("max_value", None) is not None: - deprecation_message = ( - "The `max_value` argument is deprecated. Please use `decay` instead." - ) - deprecate( - "max_value", "1.0.0", deprecation_message, standard_warn=False) + deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead." 
+ deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False) decay = kwargs["max_value"] if kwargs.get("min_value", None) is not None: deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead." - deprecate( - "min_value", "1.0.0", deprecation_message, standard_warn=False) + deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False) min_decay = kwargs["min_value"] parameters = list(parameters) self.shadow_params = [p.clone().detach() for p in parameters] if kwargs.get("device", None) is not None: - deprecation_message = ( - "The `device` argument is deprecated. Please use `to` instead.") - deprecate( - "device", "1.0.0", deprecation_message, standard_warn=False) + deprecation_message = "The `device` argument is deprecated. Please use `to` instead." + deprecate("device", "1.0.0", deprecation_message, standard_warn=False) self.to(device=kwargs["device"]) self.temp_stored_params = None @@ -153,23 +150,17 @@ def from_pretrained(cls, path, model_cls) -> "EMAModel": _, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True) model = model_cls.from_pretrained(path) - ema_model = cls(model.parameters(), - model_cls=model_cls, - model_config=model.config) + ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config) ema_model.load_state_dict(ema_kwargs) return ema_model def save_pretrained(self, path): if self.model_cls is None: - raise ValueError( - "`save_pretrained` can only be used if `model_cls` was defined at __init__." - ) + raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.") if self.model_config is None: - raise ValueError( - "`save_pretrained` can only be used if `model_config` was defined at __init__." - ) + raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.") model = self.model_cls.from_config(self.model_config) state_dict = self.state_dict() @@ -190,7 +181,7 @@ def get_decay(self, optimization_step: int) -> float: return 0.0 if self.use_ema_warmup: - cur_decay_value = 1 - (1 + step / self.inv_gamma)**-self.power + cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power else: cur_decay_value = (1 + step) / (10 + step) @@ -204,12 +195,14 @@ def step(self, parameters): if isinstance(parameters, nn.Layer): deprecation_message = ( "Passing a `nn.Layer` to `ExponentialMovingAverage.step` is deprecated. " - "Please pass the parameters of the module instead.") + "Please pass the parameters of the module instead." + ) deprecate( "passing a `nn.Layer` to `ExponentialMovingAverage.step`", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) parameters = parameters.parameters() parameters = list(parameters) @@ -223,8 +216,7 @@ def step(self, parameters): for s_param, param in zip(self.shadow_params, parameters): if not param.stop_gradient: - s_param.copy_(s_param - one_minus_decay * (s_param - param), - True) + s_param.copy_(s_param - one_minus_decay * (s_param - param), True) else: s_param.copy_(param, True) @@ -267,9 +259,7 @@ def store(self, parameters) -> None: parameters: Iterable of `nn.Parameter`; the parameters to be temporarily stored. """ - self.temp_stored_params = [ - param.detach().cpu().clone() for param in parameters - ] + self.temp_stored_params = [param.detach().cpu().clone() for param in parameters] def restore(self, parameters) -> None: r""" @@ -282,9 +272,7 @@ def restore(self, parameters) -> None: `ExponentialMovingAverage` was initialized will be used. 
""" if self.temp_stored_params is None: - raise RuntimeError( - "This ExponentialMovingAverage has no `store()`ed weights " - "to `restore()`") + raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`") for c_param, param in zip(self.temp_stored_params, parameters): param.copy_(c_param, True) @@ -310,18 +298,15 @@ def load_state_dict(self, state_dict: dict) -> None: if not isinstance(self.min_decay, float): raise ValueError("Invalid min_decay") - self.optimization_step = state_dict.get("optimization_step", - self.optimization_step) + self.optimization_step = state_dict.get("optimization_step", self.optimization_step) if not isinstance(self.optimization_step, int): raise ValueError("Invalid optimization_step") - self.update_after_step = state_dict.get("update_after_step", - self.update_after_step) + self.update_after_step = state_dict.get("update_after_step", self.update_after_step) if not isinstance(self.update_after_step, int): raise ValueError("Invalid update_after_step") - self.use_ema_warmup = state_dict.get("use_ema_warmup", - self.use_ema_warmup) + self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup) if not isinstance(self.use_ema_warmup, bool): raise ValueError("Invalid use_ema_warmup") @@ -338,8 +323,7 @@ def load_state_dict(self, state_dict: dict) -> None: self.shadow_params = shadow_params if not isinstance(self.shadow_params, list): raise ValueError("shadow_params must be a list") - if not all( - isinstance(p, paddle.Tensor) for p in self.shadow_params): + if not all(isinstance(p, paddle.Tensor) for p in self.shadow_params): raise ValueError("shadow_params must all be Tensors") @@ -353,17 +337,13 @@ def main_process_first(desc="work"): try: if not is_main_process: # tell all replicas to wait - logger.debug( - f"{rank}: waiting for the {main_process_desc} to perform {desc}" - ) + logger.debug(f"{rank}: waiting for the {main_process_desc} to perform {desc}") paddle.distributed.barrier() yield finally: if is_main_process: # the wait is over - logger.debug( - f"{rank}: {main_process_desc} completed {desc}, releasing all replicas" - ) + logger.debug(f"{rank}: {main_process_desc} completed {desc}, releasing all replicas") paddle.distributed.barrier() else: yield diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py index 93a62dd290d7b..4b5b8ba7e4234 100644 --- a/ppdiffusers/ppdiffusers/utils/__init__.py +++ b/ppdiffusers/ppdiffusers/utils/__init__.py @@ -20,33 +20,78 @@ from ..version import VERSION as __version__ from . 
import initializer_utils from .constants import ( - CONFIG_NAME, DEPRECATED_REVISION_ARGS, DIFFUSERS_CACHE, DOWNLOAD_SERVER, - FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, - FROM_DIFFUSERS, FROM_HF_HUB, HF_MODULES_CACHE, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, LOW_CPU_MEM_USAGE_DEFAULT, NEG_INF, - ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PADDLE_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, PPDIFFUSERS_DYNAMIC_MODULE_NAME, - PPDIFFUSERS_MODULES_CACHE, PPNLP_BOS_RESOLVE_ENDPOINT, TEST_DOWNLOAD_SERVER, - TEXT_ENCODER_ATTN_MODULE, TO_DIFFUSERS, TORCH_SAFETENSORS_WEIGHTS_NAME, - TORCH_WEIGHTS_NAME, WEIGHTS_NAME, get_map_location_default, str2bool) + CONFIG_NAME, + DEPRECATED_REVISION_ARGS, + DIFFUSERS_CACHE, + DOWNLOAD_SERVER, + FASTDEPLOY_MODEL_NAME, + FASTDEPLOY_WEIGHTS_NAME, + FLAX_WEIGHTS_NAME, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_MODULES_CACHE, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + LOW_CPU_MEM_USAGE_DEFAULT, + NEG_INF, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + PADDLE_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + PPDIFFUSERS_DYNAMIC_MODULE_NAME, + PPDIFFUSERS_MODULES_CACHE, + PPNLP_BOS_RESOLVE_ENDPOINT, + TEST_DOWNLOAD_SERVER, + TEXT_ENCODER_ATTN_MODULE, + TO_DIFFUSERS, + TORCH_SAFETENSORS_WEIGHTS_NAME, + TORCH_WEIGHTS_NAME, + WEIGHTS_NAME, + get_map_location_default, + str2bool, +) from .deprecation_utils import deprecate from .doc_utils import replace_example_docstring -from .download_utils import (_add_variant, _get_model_file, bos_hf_download, - ppdiffusers_bos_dir_download, - ppdiffusers_url_download) +from .download_utils import ( + _add_variant, + _get_model_file, + bos_hf_download, + ppdiffusers_bos_dir_download, + ppdiffusers_url_download, +) from .dynamic_modules_utils import get_class_from_dynamic_module from .hub_utils import HF_HUB_OFFLINE, extract_commit_hash, http_user_agent from .import_utils import ( - BACKENDS_MAPPING, ENV_VARS_TRUE_AND_AUTO_VALUES, ENV_VARS_TRUE_VALUES, - DummyObject, OptionalDependencyNotAvailable, is_bs4_available, - is_einops_available, is_fastdeploy_available, is_ftfy_available, - is_inflect_available, is_k_diffusion_available, is_k_diffusion_version, - is_librosa_available, is_note_seq_available, is_omegaconf_available, - is_paddle_available, is_paddle_version, is_paddlenlp_available, - is_paddlenlp_version, is_ppxformers_available, is_safetensors_available, - is_scipy_available, is_tensorboard_available, is_torch_available, - is_torch_version, is_unidecode_available, is_visualdl_available, - is_wandb_available, requires_backends) + BACKENDS_MAPPING, + ENV_VARS_TRUE_AND_AUTO_VALUES, + ENV_VARS_TRUE_VALUES, + DummyObject, + OptionalDependencyNotAvailable, + is_bs4_available, + is_einops_available, + is_fastdeploy_available, + is_ftfy_available, + is_inflect_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_librosa_available, + is_note_seq_available, + is_omegaconf_available, + is_paddle_available, + is_paddle_version, + is_paddlenlp_available, + is_paddlenlp_version, + is_ppxformers_available, + is_safetensors_available, + is_scipy_available, + is_tensorboard_available, + is_torch_available, + is_torch_version, + is_unidecode_available, + is_visualdl_available, + is_wandb_available, + requires_backends, +) + # custom load_utils from .load_utils import is_torch_file, safetensors_load, smart_load, torch_load from .logging import get_logger @@ -56,9 +101,21 @@ if is_paddle_available(): from .testing_utils import ( - floats_tensor, image_grid, load_hf_numpy, load_image, load_numpy, - load_pd, load_ppnlp_numpy, nightly, 
paddle_all_close, paddle_device, - parse_flag_from_env, print_tensor_test, require_paddle_gpu, slow) + floats_tensor, + image_grid, + load_hf_numpy, + load_image, + load_numpy, + load_pd, + load_ppnlp_numpy, + nightly, + paddle_all_close, + paddle_device, + parse_flag_from_env, + print_tensor_test, + require_paddle_gpu, + slow, + ) if is_torch_available(): from .testing_utils import require_torch diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py index 2a112f725dc0c..2e51e9e559395 100644 --- a/ppdiffusers/ppdiffusers/utils/constants.py +++ b/ppdiffusers/ppdiffusers/utils/constants.py @@ -31,9 +31,8 @@ def str2bool(v): ppnlp_cache_home = os.path.expanduser( - os.getenv("PPNLP_HOME", - os.path.join( - os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp"))) + os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp")) +) ppdiffusers_default_cache_path = os.path.join(ppnlp_cache_home, "ppdiffusers") # diffusers_default_cache_path = os.path.join(HUGGINGFACE_HUB_CACHE, "diffusers") @@ -51,25 +50,20 @@ def str2bool(v): DIFFUSERS_CACHE = diffusers_default_cache_path DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" PPDIFFUSERS_DYNAMIC_MODULE_NAME = "ppdiffusers_modules" -HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", - os.path.join(hf_cache_home, "modules")) -PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE", - os.path.join(ppnlp_cache_home, "modules")) +HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) +PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE", os.path.join(ppnlp_cache_home, "modules")) PADDLE_WEIGHTS_NAME = "model_state.pdparams" FASTDEPLOY_WEIGHTS_NAME = "inference.pdiparams" FASTDEPLOY_MODEL_NAME = "inference.pdmodel" WEIGHTS_NAME = PADDLE_WEIGHTS_NAME -TEST_DOWNLOAD_SERVER = ( - "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests") +TEST_DOWNLOAD_SERVER = "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests" DOWNLOAD_SERVER = "https://bj.bcebos.com/paddlenlp/models/community" -PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT", - "https://bj.bcebos.com/paddlenlp") +PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp") DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"] TEXT_ENCODER_ATTN_MODULE = ".self_attn" -LOW_CPU_MEM_USAGE_DEFAULT = str2bool( - os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False)) +LOW_CPU_MEM_USAGE_DEFAULT = str2bool(os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False)) NEG_INF = -1e4 @@ -87,5 +81,4 @@ def str2bool(v): def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): print(x.tolist()) print(y.tolist()) - return raw_all_close( - x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name) + return raw_all_close(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name) diff --git a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py index 8207e2c77d07f..010f89e11386e 100644 --- a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py +++ b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py @@ -21,39 +21,38 @@ def deprecate( - *args, - take_from: Optional[Union[Dict, Any]]=None, - standard_warn=True, - stacklevel=2, ): + *args, + take_from: Optional[Union[Dict, Any]] = None, + standard_warn=True, + stacklevel=2, +): from ..version import VERSION as __version__ deprecated_kwargs = take_from values = () if not isinstance(args[0], tuple): - args = (args, ) + args = (args,) for attribute, 
version_name, message in args: - if version.parse(version.parse(__version__) - .base_version) >= version.parse(version_name): + if version.parse(version.parse(__version__).base_version) >= version.parse(version_name): raise ValueError( f"The deprecation tuple {(attribute, version_name, message)} should be removed since ppdiffusers'" - f" version {__version__} is >= {version_name}") + f" version {__version__} is >= {version_name}" + ) warning = None - if isinstance(deprecated_kwargs, - dict) and attribute in deprecated_kwargs: - values += (deprecated_kwargs.pop(attribute), ) + if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs: + values += (deprecated_kwargs.pop(attribute),) warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}." elif hasattr(deprecated_kwargs, attribute): - values += (getattr(deprecated_kwargs, attribute), ) + values += (getattr(deprecated_kwargs, attribute),) warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}." elif deprecated_kwargs is None: warning = f"`{attribute}` is deprecated and will be removed in version {version_name}." if warning is not None: warning = warning + " " if standard_warn else "" - warnings.warn( - warning + message, FutureWarning, stacklevel=stacklevel) + warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel) if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0: call_frame = inspect.getouterframes(inspect.currentframe())[1] @@ -61,9 +60,7 @@ def deprecate( line_number = call_frame.lineno function = call_frame.function key, value = next(iter(deprecated_kwargs.items())) - raise TypeError( - f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`" - ) + raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`") if len(values) == 0: return diff --git a/ppdiffusers/ppdiffusers/utils/doc_utils.py b/ppdiffusers/ppdiffusers/utils/doc_utils.py index c8b3fe1ab24bc..01188c98e9152 100644 --- a/ppdiffusers/ppdiffusers/utils/doc_utils.py +++ b/ppdiffusers/ppdiffusers/utils/doc_utils.py @@ -23,8 +23,7 @@ def docstring_decorator(fn): func_doc = fn.__doc__ lines = func_doc.split("\n") i = 0 - while i < len(lines) and re.search(r"^\s*Examples?:\s*$", - lines[i]) is None: + while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None: i += 1 if i < len(lines): lines[i] = example_docstring @@ -32,7 +31,8 @@ def docstring_decorator(fn): else: raise ValueError( f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, " - f"current docstring is:\n{func_doc}") + f"current docstring is:\n{func_doc}" + ) fn.__doc__ = func_doc return fn diff --git a/ppdiffusers/ppdiffusers/utils/download_utils.py b/ppdiffusers/ppdiffusers/utils/download_utils.py index a65ba335b0282..2ef31e8ba396b 100644 --- a/ppdiffusers/ppdiffusers/utils/download_utils.py +++ b/ppdiffusers/ppdiffusers/utils/download_utils.py @@ -28,8 +28,11 @@ from filelock import FileLock from huggingface_hub import hf_hub_download from huggingface_hub.file_download import _chmod_and_replace, http_get -from huggingface_hub.utils import (EntryNotFoundError, RepositoryNotFoundError, - RevisionNotFoundError) +from huggingface_hub.utils import ( + EntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) from huggingface_hub.utils import tqdm as hf_tqdm from packaging import version from requests import HTTPError @@ -37,14 +40,18 @@ from 
tqdm.contrib.concurrent import thread_map from ..version import VERSION as __version__ -from .constants import (DEPRECATED_REVISION_ARGS, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, PPDIFFUSERS_CACHE, - PPNLP_BOS_RESOLVE_ENDPOINT, - TORCH_SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME) +from .constants import ( + DEPRECATED_REVISION_ARGS, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + PPDIFFUSERS_CACHE, + PPNLP_BOS_RESOLVE_ENDPOINT, + TORCH_SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, +) from .logging import get_logger -def _add_variant(weights_name: str, variant: Optional[str]=None) -> str: +def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: if variant is not None: splits = weights_name.split(".") splits = splits[:-1] + [variant] + splits[-1:] @@ -55,36 +62,34 @@ def _add_variant(weights_name: str, variant: Optional[str]=None) -> str: # https://github.com/huggingface/diffusers/blob/da2ce1a6b92f48cabe9e9d3944c4ee8b007b2871/src/diffusers/utils/hub_utils.py#L246 def _get_model_file( - pretrained_model_name_or_path, - *, - weights_name, - subfolder, - cache_dir, - force_download=False, - revision=None, - proxies=None, - resume_download=False, - local_files_only=None, - use_auth_token=None, - user_agent=None, - commit_hash=None, - file_lock_timeout=-1, - from_hf_hub=False, ): + pretrained_model_name_or_path, + *, + weights_name, + subfolder, + cache_dir, + force_download=False, + revision=None, + proxies=None, + resume_download=False, + local_files_only=None, + use_auth_token=None, + user_agent=None, + commit_hash=None, + file_lock_timeout=-1, + from_hf_hub=False, +): pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isfile(pretrained_model_name_or_path): return pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile( - os.path.join(pretrained_model_name_or_path, weights_name)): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)): # Load from a PyTorch checkpoint - model_file = os.path.join(pretrained_model_name_or_path, - weights_name) + model_file = os.path.join(pretrained_model_name_or_path, weights_name) return model_file elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, - weights_name)): - model_file = os.path.join(pretrained_model_name_or_path, subfolder, - weights_name) + os.path.join(pretrained_model_name_or_path, subfolder, weights_name) + ): + model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name) return model_file else: raise EnvironmentError( @@ -105,19 +110,20 @@ def _get_model_file( use_auth_token=use_auth_token, user_agent=user_agent, file_lock_timeout=file_lock_timeout, - commit_hash=commit_hash, ) + commit_hash=commit_hash, + ) REPO_TYPES = ["model"] DEFAULT_REVISION = "main" # REPO_ID_SEPARATOR = "--" REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") -PPDIFFUSERS_BOS_URL_TEMPLATE = ( - PPNLP_BOS_RESOLVE_ENDPOINT + - "/{repo_type}/community/{repo_id}/{revision}/{filename}") +PPDIFFUSERS_BOS_URL_TEMPLATE = PPNLP_BOS_RESOLVE_ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}" ALLOW_PATTERNS_MAPPING = { - "scheduler": ["scheduler_config.json", ], + "scheduler": [ + "scheduler_config.json", + ], "text_encoder": [ "model_state.pdparams", "config.json", @@ -190,12 +196,13 @@ def _get_model_file( def ppdiffusers_bos_url( - repo_id: str, - filename: str, - *, - subfolder: Optional[str]=None, - repo_type: Optional[str]=None, - revision: Optional[str]=None, ) -> str: + repo_id: str, + filename: str, 
+ *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, +) -> str: if subfolder == "": subfolder = None if subfolder is not None: @@ -212,9 +219,9 @@ def ppdiffusers_bos_url( return PPDIFFUSERS_BOS_URL_TEMPLATE.format( repo_type=repo_type, repo_id=repo_id, - revision=quote( - revision, safe=""), - filename=quote(filename), ).replace(f"/{DEFAULT_REVISION}/", "/") + revision=quote(revision, safe=""), + filename=quote(filename), + ).replace(f"/{DEFAULT_REVISION}/", "/") def repo_folder_name(*, repo_id: str, repo_type: str) -> str: @@ -229,16 +236,17 @@ def repo_folder_name(*, repo_id: str, repo_type: str) -> str: def ppdiffusers_bos_download( - repo_id: str, - filename: str, - *, - subfolder: Optional[str]=None, - repo_type: Optional[str]=None, - revision: Optional[str]=None, - cache_dir: Union[str, Path, None]=None, - force_download: bool=False, - resume_download: bool=False, - file_lock_timeout: int=-1, ): + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + force_download: bool = False, + resume_download: bool = False, + file_lock_timeout: int = -1, +): if cache_dir is None: cache_dir = PPDIFFUSERS_CACHE if revision is None: @@ -256,12 +264,8 @@ def ppdiffusers_bos_download( repo_type = REPO_TYPES[0] if repo_type not in REPO_TYPES: - raise ValueError( - f"Invalid repo type: {repo_type}. Accepted repo types are:" - f" {str(REPO_TYPES)}") - storage_folder = os.path.join( - cache_dir, repo_folder_name( - repo_id=repo_id, repo_type=repo_type)) + raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are:" f" {str(REPO_TYPES)}") + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) os.makedirs(storage_folder, exist_ok=True) # cross platform transcription of filename, to be used as a local file path. @@ -275,8 +279,7 @@ def ppdiffusers_bos_download( if os.path.exists(pointer_path) and not force_download: return pointer_path - url_to_download = ppdiffusers_bos_url( - repo_id, filename, repo_type=repo_type, revision=revision) + url_to_download = ppdiffusers_bos_url(repo_id, filename, repo_type=repo_type, revision=revision) blob_path = os.path.join(storage_folder, filename) # Prevent parallel downloads of the same file with a lock. @@ -312,10 +315,8 @@ def _resumable_file_manager(): resume_size = 0 else: temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, - mode="wb", - dir=cache_dir, - delete=False) + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. 
@@ -328,7 +329,8 @@ def _resumable_file_manager(): temp_file, proxies=None, resume_size=resume_size, - headers=None, ) + headers=None, + ) logger.info("storing %s in cache at %s", url_to_download, blob_path) _chmod_and_replace(temp_file.name, blob_path) @@ -341,12 +343,13 @@ def _resumable_file_manager(): def ppdiffusers_url_download( - url_to_download: str, - cache_dir: Union[str, Path, None]=None, - filename: Optional[str]=None, - force_download: bool=False, - resume_download: bool=False, - file_lock_timeout: int=-1, ): + url_to_download: str, + cache_dir: Union[str, Path, None] = None, + filename: Optional[str] = None, + force_download: bool = False, + resume_download: bool = False, + file_lock_timeout: int = -1, +): if cache_dir is None: cache_dir = PPDIFFUSERS_CACHE if isinstance(cache_dir, Path): @@ -386,10 +389,8 @@ def _resumable_file_manager(): resume_size = 0 else: temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, - mode="wb", - dir=cache_dir, - delete=False) + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. @@ -402,7 +403,8 @@ def _resumable_file_manager(): temp_file, proxies=None, resume_size=resume_size, - headers=None, ) + headers=None, + ) logger.info("storing %s in cache at %s", url_to_download, file_path) _chmod_and_replace(temp_file.name, file_path) @@ -414,28 +416,29 @@ def _resumable_file_manager(): def bos_hf_download( - pretrained_model_name_or_path, - *, - filename, - subfolder, - cache_dir, - force_download=False, - revision=None, - from_hf_hub=False, - proxies=None, - resume_download=False, - local_files_only=None, - use_auth_token=None, - user_agent=None, - file_lock_timeout=-1, - commit_hash=None, ): + pretrained_model_name_or_path, + *, + filename, + subfolder, + cache_dir, + force_download=False, + revision=None, + from_hf_hub=False, + proxies=None, + resume_download=False, + local_files_only=None, + use_auth_token=None, + user_agent=None, + file_lock_timeout=-1, + commit_hash=None, +): if from_hf_hub: # 1. First check if deprecated way of loading from branches is used - if (revision in DEPRECATED_REVISION_ARGS and - (filename == WEIGHTS_NAME or - filename == TORCH_SAFETENSORS_WEIGHTS_NAME) and - version.parse(version.parse(__version__).base_version) >= - version.parse("0.17.0")): + if ( + revision in DEPRECATED_REVISION_ARGS + and (filename == WEIGHTS_NAME or filename == TORCH_SAFETENSORS_WEIGHTS_NAME) + and version.parse(version.parse(__version__).base_version) >= version.parse("0.17.0") + ): try: model_file = hf_hub_download( pretrained_model_name_or_path, @@ -448,15 +451,18 @@ def bos_hf_download( use_auth_token=use_auth_token, user_agent=user_agent, subfolder=subfolder, - revision=revision or commit_hash, ) + revision=revision or commit_hash, + ) warnings.warn( f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.", - FutureWarning, ) + FutureWarning, + ) return model_file except: # noqa: E722 warnings.warn( f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. 
However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(filename, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(filename, revision)}' so that the correct variant file can be added.", - FutureWarning, ) + FutureWarning, + ) # 2. Load model file as usual try: model_file = hf_hub_download( @@ -470,7 +476,8 @@ def bos_hf_download( use_auth_token=use_auth_token, user_agent=user_agent, subfolder=subfolder, - revision=revision, ) + revision=revision, + ) return model_file except RepositoryNotFoundError: @@ -478,7 +485,8 @@ def bos_hf_download( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " - "login`.") + "login`." + ) except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " @@ -486,9 +494,7 @@ def bos_hf_download( f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." ) except EntryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {filename}." - ) + raise EnvironmentError(f"{pretrained_model_name_or_path} does not appear to have a file named {filename}.") except HTTPError as err: raise EnvironmentError( f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" @@ -506,7 +512,8 @@ def bos_hf_download( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a file named {filename}") + f"containing a file named {filename}" + ) except KeyboardInterrupt: raise EnvironmentError( "You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!" @@ -521,7 +528,8 @@ def bos_hf_download( resume_download=resume_download, subfolder=subfolder, revision=revision, - file_lock_timeout=file_lock_timeout, ) + file_lock_timeout=file_lock_timeout, + ) return model_file except HTTPError as err: raise EnvironmentError( @@ -529,13 +537,15 @@ def bos_hf_download( f"There was a specific connection error when trying to load '{pretrained_model_name_or_path}'! " f"We couldn't connect to '{PPNLP_BOS_RESOLVE_ENDPOINT}' to load this model, couldn't find it " f"in the cached files and it looks like '{pretrained_model_name_or_path}' is not the path to a " - f"directory containing a file named '{filename}'.") + f"directory containing a file named '{filename}'." + ) except EnvironmentError: raise EnvironmentError( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " f"'{PPNLP_BOS_RESOLVE_ENDPOINT}', make sure you don't have a local directory with the same name. 
" f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a file named '{filename}'") + f"containing a file named '{filename}'" + ) except KeyboardInterrupt: raise EnvironmentError( "You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!" @@ -562,20 +572,21 @@ def url_file_exists(url: str) -> bool: def ppdiffusers_bos_dir_download( - repo_id: str, - *, - revision: Optional[str]=None, - repo_type: Optional[str]=None, - cache_dir: Union[str, Path, None]=None, - force_download: bool=False, - resume_download: bool=False, - folder_names: Optional[Union[List[str], str]]=None, - max_workers: int=1, - tqdm_class: Optional[base_tqdm]=None, - variant: Optional[str]=None, - is_fastdeploy_model: Optional[str]=False, - file_lock_timeout: int=-1, - local_files_only: bool=False, ) -> str: + repo_id: str, + *, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + force_download: bool = False, + resume_download: bool = False, + folder_names: Optional[Union[List[str], str]] = None, + max_workers: int = 1, + tqdm_class: Optional[base_tqdm] = None, + variant: Optional[str] = None, + is_fastdeploy_model: Optional[str] = False, + file_lock_timeout: int = -1, + local_files_only: bool = False, +) -> str: # update repo id must end with @fastdeploy if is_fastdeploy_model and not repo_id.endswith("@fastdeploy"): repo_id = f"{repo_id}@fastdeploy" @@ -585,12 +596,9 @@ def ppdiffusers_bos_dir_download( filtered_repo_files = [["model_index.json", None]] for subfolder in folder_names: - allow_patterns = ALLOW_PATTERNS_MAPPING.get( - subfolder, ALLOW_PATTERNS_MAPPING["others"]) + allow_patterns = ALLOW_PATTERNS_MAPPING.get(subfolder, ALLOW_PATTERNS_MAPPING["others"]) if is_fastdeploy_model: - allow_patterns = [ - ap for ap in allow_patterns if "pdparams" not in ap - ] + allow_patterns = [ap for ap in allow_patterns if "pdparams" not in ap] allow_patterns.extend(["inference.pdiparams", "inference.pdmodel"]) for filename in allow_patterns: need_to_check_no_variant_file = False @@ -602,25 +610,31 @@ def ppdiffusers_bos_dir_download( url = ppdiffusers_bos_url( repo_id, filename=filename, - subfolder=subfolder, ) + subfolder=subfolder, + ) if url_file_exists(url): # exist file - filtered_repo_files.append([ - filename, - subfolder, - ]) + filtered_repo_files.append( + [ + filename, + subfolder, + ] + ) else: if need_to_check_no_variant_file: url = ppdiffusers_bos_url( repo_id, filename=raw_filename, - subfolder=subfolder, ) + subfolder=subfolder, + ) if url_file_exists(url): # exist file - filtered_repo_files.append([ - raw_filename, - subfolder, - ]) + filtered_repo_files.append( + [ + raw_filename, + subfolder, + ] + ) def _inner_ppdiffusers_bos_download(repo_file_list): filename, _subfolder = repo_file_list @@ -633,7 +647,8 @@ def _inner_ppdiffusers_bos_download(repo_file_list): revision=revision, resume_download=resume_download, force_download=force_download, - file_lock_timeout=file_lock_timeout, ) + file_lock_timeout=file_lock_timeout, + ) thread_map( _inner_ppdiffusers_bos_download, @@ -641,5 +656,6 @@ def _inner_ppdiffusers_bos_download(repo_file_list): desc=f"Fetching {len(filtered_repo_files)} files", max_workers=max_workers, # User can use its own tqdm class or the default one from `huggingface_hub.utils` - tqdm_class=tqdm_class or hf_tqdm, ) + tqdm_class=tqdm_class or hf_tqdm, + ) return os.path.join(cache_dir, repo_id) diff --git 
a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py index cca1dbd1d7d0d..fcbc659ea253c 100644 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py +++ b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py @@ -225,8 +225,7 @@ def get_cosine_schedule_with_warmup(*args, **kwargs): def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs): - requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, - ["paddle"]) + requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["paddle"]) def get_linear_schedule_with_warmup(*args, **kwargs): diff --git a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py index c1da547b98964..574a504c2d775 100644 --- a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py +++ b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py @@ -26,14 +26,16 @@ from typing import Dict, Optional, Union from urllib import request -from huggingface_hub import (HfFolder, cached_download, hf_hub_download, - model_info) +from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info -from . import (PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE, - logging) +from . import PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE, logging -COMMUNITY_PIPELINES_URL = "https://raw.githubusercontent.com/PaddlePaddle/PaddleMIX/{revision}/ppdiffusers/examples/community/{pipeline}.py" -GITEE_COMMUNITY_PIPELINES_URL = "https://gitee.com/paddlepaddle/PaddleMIX/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py" +COMMUNITY_PIPELINES_URL = ( + "https://raw.githubusercontent.com/PaddlePaddle/PaddleMIX/{revision}/ppdiffusers/examples/community/{pipeline}.py" +) +GITEE_COMMUNITY_PIPELINES_URL = ( + "https://gitee.com/paddlepaddle/PaddleMIX/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py" +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -87,11 +89,9 @@ def get_relative_imports(module_file): content = f.read() # Imports of the form `import .xxx` - relative_imports = re.findall( - "^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE) + relative_imports = re.findall("^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE) # Imports of the form `from .xxx import yyy` - relative_imports += re.findall( - "^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE) + relative_imports += re.findall("^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE) # Unique-ify return list(set(relative_imports)) @@ -116,9 +116,7 @@ def get_relative_import_files(module_file): module_path = Path(module_file).parent new_import_files = [str(module_path / m) for m in new_imports] - new_import_files = [ - f for f in new_import_files if f not in all_relative_imports - ] + new_import_files = [f for f in new_import_files if f not in all_relative_imports] files_to_check = [f"{f}.py" for f in new_import_files] no_change = len(new_import_files) == 0 @@ -137,8 +135,7 @@ def check_imports(filename): # Imports of the form `import xxx` imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) # Imports of the form `from xxx import yyy` - imports += re.findall( - "^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) # Only keep the top-level module imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] @@ -187,29 +184,33 @@ def find_pipeline_class(loaded_module): 
pipeline_class = None for cls_name, cls in cls_members.items(): - if (cls_name != DiffusionPipeline.__name__ and - issubclass(cls, DiffusionPipeline) and - cls.__module__.split(".")[0] != "ppdiffusers"): + if ( + cls_name != DiffusionPipeline.__name__ + and issubclass(cls, DiffusionPipeline) + and cls.__module__.split(".")[0] != "ppdiffusers" + ): if pipeline_class is not None: raise ValueError( f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:" f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in" - f" {loaded_module}.") + f" {loaded_module}." + ) pipeline_class = cls return pipeline_class def get_cached_module_file( - pretrained_model_name_or_path: Union[str, os.PathLike], - module_file: str, - cache_dir: Optional[Union[str, os.PathLike]]=None, - force_download: bool=False, - resume_download: bool=False, - proxies: Optional[Dict[str, str]]=None, - use_auth_token: Optional[Union[bool, str]]=None, - revision: Optional[str]=None, - local_files_only: bool=False, ): + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, +): """ Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached Transformers module. @@ -260,8 +261,7 @@ def get_cached_module_file( # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. pretrained_model_name_or_path = str(pretrained_model_name_or_path) - module_file_or_url = os.path.join(pretrained_model_name_or_path, - module_file) + module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) if os.path.isfile(module_file_or_url): resolved_module_file = module_file_or_url @@ -273,8 +273,7 @@ def get_cached_module_file( logger.info(f"Defaulting to main: {revision}.") # community pipeline on GitHub - github_url = COMMUNITY_PIPELINES_URL.format( - revision=revision, pipeline=pretrained_model_name_or_path) + github_url = COMMUNITY_PIPELINES_URL.format(revision=revision, pipeline=pretrained_model_name_or_path) try: resolved_module_file = cached_download( github_url, @@ -283,13 +282,12 @@ def get_cached_module_file( proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, - use_auth_token=False, ) + use_auth_token=False, + ) submodule = "git" module_file = pretrained_model_name_or_path + ".py" except EnvironmentError: - logger.error( - f"Could not locate the {module_file} inside {pretrained_model_name_or_path}." - ) + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") raise else: try: @@ -302,13 +300,11 @@ def get_cached_module_file( proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, - use_auth_token=use_auth_token, ) - submodule = os.path.join( - "local", "--".join(pretrained_model_name_or_path.split("/"))) - except EnvironmentError: - logger.error( - f"Could not locate the {module_file} inside {pretrained_model_name_or_path}." 
+ use_auth_token=use_auth_token, ) + submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/"))) + except EnvironmentError: + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") raise # Check we have all the requirements in our environment @@ -327,7 +323,8 @@ def get_cached_module_file( module_needed = f"{module_needed}.py" shutil.copy( os.path.join(pretrained_model_name_or_path, module_needed), - submodule_path / module_needed, ) + submodule_path / module_needed, + ) else: # Get the commit hash # TODO: we will get this info in the etag soon, so retrieve it from there and not here. @@ -338,8 +335,7 @@ def get_cached_module_file( else: token = None - commit_hash = model_info( - pretrained_model_name_or_path, revision=revision, token=token).sha + commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the # benefit of versioning. @@ -361,22 +357,24 @@ def get_cached_module_file( proxies=proxies, use_auth_token=use_auth_token, revision=revision, - local_files_only=local_files_only, ) + local_files_only=local_files_only, + ) return os.path.join(full_submodule, module_file) def get_class_from_dynamic_module( - pretrained_model_name_or_path: Union[str, os.PathLike], - module_file: str, - class_name: Optional[str]=None, - cache_dir: Optional[Union[str, os.PathLike]]=None, - force_download: bool=False, - resume_download: bool=False, - proxies: Optional[Dict[str, str]]=None, - use_auth_token: Optional[Union[bool, str]]=None, - revision: Optional[str]=None, - local_files_only: bool=False, - **kwargs, ): + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + class_name: Optional[str] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): """ Extracts a class from a module file, present in the local folder or repository of a model. 
@@ -449,5 +447,6 @@ def get_class_from_dynamic_module( proxies=proxies, use_auth_token=use_auth_token, revision=revision, - local_files_only=local_files_only, ) + local_files_only=local_files_only, + ) return get_class_in_module(class_name, final_module.replace(".py", "")) diff --git a/ppdiffusers/ppdiffusers/utils/hub_utils.py b/ppdiffusers/ppdiffusers/utils/hub_utils.py index 391c8099f0b30..8de82f5ab9800 100644 --- a/ppdiffusers/ppdiffusers/utils/hub_utils.py +++ b/ppdiffusers/ppdiffusers/utils/hub_utils.py @@ -28,8 +28,14 @@ from ..version import VERSION as __version__ from .constants import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT from .import_utils import ( - ENV_VARS_TRUE_VALUES, _fastdeploy_version, _paddle_version, _torch_version, - is_fastdeploy_available, is_paddle_available, is_torch_available) + ENV_VARS_TRUE_VALUES, + _fastdeploy_version, + _paddle_version, + _torch_version, + is_fastdeploy_available, + is_paddle_available, + is_torch_available, +) from .logging import get_logger logger = get_logger(__name__) @@ -37,12 +43,11 @@ MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md" SESSION_ID = uuid4().hex HF_HUB_OFFLINE = os.getenv("HF_HUB_OFFLINE", "").upper() in ENV_VARS_TRUE_VALUES -DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", - "").upper() in ENV_VARS_TRUE_VALUES +DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES HUGGINGFACE_CO_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/" -def http_user_agent(user_agent: Union[Dict, str, None]=None) -> str: +def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: """ Formats a user-agent string with basic info about a request. """ @@ -65,9 +70,7 @@ def http_user_agent(user_agent: Union[Dict, str, None]=None) -> str: return ua -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -82,7 +85,8 @@ def create_model_card(args, model_name): raise ValueError( "Modelcard rendering is based on Jinja templates." " Please make sure to have `jinja` installed before using `create_model_card`." - " To install it, please run `pip install Jinja2`.") + " To install it, please run `pip install Jinja2`." 
+ ) if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]: return @@ -97,41 +101,35 @@ def create_model_card(args, model_name): library_name="ppdiffusers", tags=[], datasets=args.dataset_name, - metrics=[], ), + metrics=[], + ), template_path=MODEL_CARD_TEMPLATE_PATH, model_name=model_name, repo_name=repo_name, - dataset_name=args.dataset_name - if hasattr(args, "dataset_name") else None, + dataset_name=args.dataset_name if hasattr(args, "dataset_name") else None, learning_rate=args.learning_rate, train_batch_size=args.train_batch_size, eval_batch_size=args.eval_batch_size, gradient_accumulation_steps=( - args.gradient_accumulation_steps - if hasattr(args, "gradient_accumulation_steps") else None), + args.gradient_accumulation_steps if hasattr(args, "gradient_accumulation_steps") else None + ), adam_beta1=args.adam_beta1 if hasattr(args, "adam_beta1") else None, adam_beta2=args.adam_beta2 if hasattr(args, "adam_beta2") else None, - adam_weight_decay=args.adam_weight_decay - if hasattr(args, "adam_weight_decay") else None, - adam_epsilon=args.adam_epsilon - if hasattr(args, "adam_epsilon") else None, - lr_scheduler=args.lr_scheduler - if hasattr(args, "lr_scheduler") else None, - lr_warmup_steps=args.lr_warmup_steps - if hasattr(args, "lr_warmup_steps") else None, - ema_inv_gamma=args.ema_inv_gamma - if hasattr(args, "ema_inv_gamma") else None, + adam_weight_decay=args.adam_weight_decay if hasattr(args, "adam_weight_decay") else None, + adam_epsilon=args.adam_epsilon if hasattr(args, "adam_epsilon") else None, + lr_scheduler=args.lr_scheduler if hasattr(args, "lr_scheduler") else None, + lr_warmup_steps=args.lr_warmup_steps if hasattr(args, "lr_warmup_steps") else None, + ema_inv_gamma=args.ema_inv_gamma if hasattr(args, "ema_inv_gamma") else None, ema_power=args.ema_power if hasattr(args, "ema_power") else None, - ema_max_decay=args.ema_max_decay - if hasattr(args, "ema_max_decay") else None, - mixed_precision=args.mixed_precision, ) + ema_max_decay=args.ema_max_decay if hasattr(args, "ema_max_decay") else None, + mixed_precision=args.mixed_precision, + ) card_path = os.path.join(args.output_dir, "README.md") model_card.save(card_path) -def extract_commit_hash(resolved_file: Optional[str], - commit_hash: Optional[str]=None): +def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None): """ Extracts the commit hash from a resolved filename toward a cache file. """ @@ -150,14 +148,12 @@ def extract_commit_hash(resolved_file: Optional[str], # - Diffusers doesn't use custom environment variables to specify the cache path. # - There is no need to migrate the cache format, just move the files to the new location. 
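# Illustrative sketch (not part of the patch) of the `extract_commit_hash` helper whose signature
# appears above, assuming the usual hub cache layout where resolved files sit under
# ``snapshots/<commit>/``. The path and the 40-character hash below are hypothetical placeholders.
from ppdiffusers.utils.hub_utils import extract_commit_hash  # module path taken from this diff

resolved_file = (
    "models--some-org--some-model/snapshots/"
    "39593d5650112b4cc580433f6b0435385882d819/model_index.json"
)
print(extract_commit_hash(resolved_file))  # expected: "39593d5650112b4cc580433f6b0435385882d819"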
hf_cache_home = os.path.expanduser( - os.getenv("HF_HOME", - os.path.join( - os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))) + os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) +) old_diffusers_cache = os.path.join(hf_cache_home, "diffusers") -def move_cache(old_cache_dir: Optional[str]=None, - new_cache_dir: Optional[str]=None) -> None: +def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None: if new_cache_dir is None: new_cache_dir = DIFFUSERS_CACHE if old_cache_dir is None: @@ -168,8 +164,7 @@ def move_cache(old_cache_dir: Optional[str]=None, # move file blob by blob for old_blob_path in old_cache_dir.glob("**/blobs/*"): if old_blob_path.is_file() and not old_blob_path.is_symlink(): - new_blob_path = new_cache_dir / old_blob_path.relative_to( - old_cache_dir) + new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir) new_blob_path.parent.mkdir(parents=True, exist_ok=True) os.replace(old_blob_path, new_blob_path) try: @@ -182,8 +177,7 @@ def move_cache(old_cache_dir: Optional[str]=None, # At this point, old_cache_dir contains symlinks to the new cache (it can still be used). -cache_version_file = os.path.join(DIFFUSERS_CACHE, - "version_diffusers_cache.txt") +cache_version_file = os.path.join(DIFFUSERS_CACHE, "version_diffusers_cache.txt") if not os.path.isfile(cache_version_file): cache_version = 0 else: @@ -194,13 +188,13 @@ def move_cache(old_cache_dir: Optional[str]=None, cache_version = 0 if cache_version < 1: - old_cache_is_not_empty = (os.path.isdir(old_diffusers_cache) and - len(os.listdir(old_diffusers_cache)) > 0) + old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0 if old_cache_is_not_empty: logger.warning( "The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your " "existing cached models. This is a one-time operation, you can interrupt it or run it " - "later by calling `diffusers.utils.hub_utils.move_cache()`.") + "later by calling `diffusers.utils.hub_utils.move_cache()`." + ) try: move_cache() except Exception as e: @@ -208,7 +202,8 @@ def move_cache(old_cache_dir: Optional[str]=None, logger.error( f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease " "file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole " - "message and we will do our best to help.") + "message and we will do our best to help." + ) if cache_version < 1: try: @@ -218,4 +213,5 @@ def move_cache(old_cache_dir: Optional[str]=None, except Exception: logger.warning( f"There was a problem when trying to write in your cache folder ({DIFFUSERS_CACHE}). Please, ensure " - "the directory exists and can be written to.") + "the directory exists and can be written to." 
+ ) diff --git a/ppdiffusers/ppdiffusers/utils/import_utils.py b/ppdiffusers/ppdiffusers/utils/import_utils.py index 577f4d0dc3498..cc43592be9962 100644 --- a/ppdiffusers/ppdiffusers/utils/import_utils.py +++ b/ppdiffusers/ppdiffusers/utils/import_utils.py @@ -64,8 +64,9 @@ if _paddle_available: try: - from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention # noqa + from paddle.incubate.nn.memory_efficient_attention import ( # noqa + memory_efficient_attention, + ) _ppxformers_available = True except ImportError: @@ -90,8 +91,7 @@ if _safetensors_available: try: _safetensors_version = importlib_metadata.version("safetensors") - logger.info( - f"Safetensors version {_safetensors_version} available.") + logger.info(f"Safetensors version {_safetensors_version} available.") except importlib_metadata.PackageNotFoundError: _safetensors_available = False else: @@ -101,8 +101,7 @@ _transformers_available = importlib.util.find_spec("transformers") is not None try: _transformers_version = importlib_metadata.version("transformers") - logger.debug( - f"Successfully imported transformers version {_transformers_version}") + logger.debug(f"Successfully imported transformers version {_transformers_version}") except importlib_metadata.PackageNotFoundError: _transformers_available = False @@ -116,8 +115,7 @@ _unidecode_available = importlib.util.find_spec("unidecode") is not None try: _unidecode_version = importlib_metadata.version("unidecode") - logger.debug( - f"Successfully imported unidecode version {_unidecode_version}") + logger.debug(f"Successfully imported unidecode version {_unidecode_version}") except importlib_metadata.PackageNotFoundError: _unidecode_available = False @@ -134,14 +132,12 @@ pass _fastdeploy_available = _fastdeploy_version != "N/A" if _fastdeploy_available: - logger.debug( - f"Successfully imported fastdeploy version {_fastdeploy_version}") + logger.debug(f"Successfully imported fastdeploy version {_fastdeploy_version}") _paddlenlp_available = importlib.util.find_spec("paddlenlp") is not None try: _paddlenlp_version = importlib_metadata.version("paddlenlp") - logger.debug( - f"Successfully imported paddlenlp version {_paddlenlp_version}") + logger.debug(f"Successfully imported paddlenlp version {_paddlenlp_version}") except importlib_metadata.PackageNotFoundError: _paddlenlp_available = False @@ -152,7 +148,8 @@ "opencv-python", "opencv-contrib-python", "opencv-python-headless", - "opencv-contrib-python-headless", ) + "opencv-contrib-python-headless", + ) _opencv_version = None for pkg in candidates: try: @@ -183,8 +180,7 @@ _k_diffusion_available = importlib.util.find_spec("k_diffusion") is not None try: _k_diffusion_version = importlib_metadata.version("k_diffusion") - logger.debug( - f"Successfully imported k-diffusion version {_k_diffusion_version}") + logger.debug(f"Successfully imported k-diffusion version {_k_diffusion_version}") except importlib_metadata.PackageNotFoundError: _k_diffusion_available = False @@ -205,16 +201,14 @@ _omegaconf_available = importlib.util.find_spec("omegaconf") is not None try: _omegaconf_version = importlib_metadata.version("omegaconf") - logger.debug( - f"Successfully imported omegaconf version {_omegaconf_version}") + logger.debug(f"Successfully imported omegaconf version {_omegaconf_version}") except importlib_metadata.PackageNotFoundError: _omegaconf_available = False _tensorboard_available = importlib.util.find_spec("tensorboard") try: _tensorboard_version = importlib_metadata.version("tensorboard") - 
logger.debug( - f"Successfully imported tensorboard version {_tensorboard_version}") + logger.debug(f"Successfully imported tensorboard version {_tensorboard_version}") except importlib_metadata.PackageNotFoundError: _tensorboard_available = False @@ -232,8 +226,7 @@ import einops.layers.paddle einops.layers.paddle - logger.debug( - f"Successfully imported einops version {einops.__version__}") + logger.debug(f"Successfully imported einops version {einops.__version__}") except ImportError: _einops_available = False except importlib_metadata.PackageNotFoundError: @@ -482,27 +475,29 @@ def is_bs4_available(): that match your environment. Please note that you may need to restart your runtime after installation. """ -BACKENDS_MAPPING = OrderedDict([ - ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), - ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)), - ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)), - ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)), - ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)), - ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), - ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)), - ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), - ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), - ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), - ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), - ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), - ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), - ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), - ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), - ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), - ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), - ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), - ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), -]) +BACKENDS_MAPPING = OrderedDict( + [ + ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), + ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)), + ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)), + ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)), + ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)), + ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), + ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)), + ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), + ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), + ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), + ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), + ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), + ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), + ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), + ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), + ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), + ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), + ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), + ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), + ] +) def requires_backends(obj, backends): @@ -516,26 +511,24 @@ def requires_backends(obj, backends): raise ImportError("".join(failed)) if name in [ - "VersatileDiffusionTextToImagePipeline", - "VersatileDiffusionPipeline", - "VersatileDiffusionDualGuidedPipeline", - "StableDiffusionImageVariationPipeline", - "UnCLIPPipeline", + "VersatileDiffusionTextToImagePipeline", + "VersatileDiffusionPipeline", + 
"VersatileDiffusionDualGuidedPipeline", + "StableDiffusionImageVariationPipeline", + "UnCLIPPipeline", ] and is_paddlenlp_version("<", "2.5.0"): raise ImportError( f"You need to install `paddlenlp>=2.5.0` in order to use {name}: \n```\n pip install" - " --upgrade paddlenlp \n```") + " --upgrade paddlenlp \n```" + ) - if name in [ - "StableDiffusionDepth2ImgPipeline", - "StableDiffusionPix2PixZeroPipeline", - ] and is_paddlenlp_version( - "<", - "2.5.1" # TODO version + if name in ["StableDiffusionDepth2ImgPipeline", "StableDiffusionPix2PixZeroPipeline"] and is_paddlenlp_version( + "<", "2.5.1" # TODO version ): raise ImportError( f"You need to install `paddlenlp>=2.5.1` in order to use {name}: \n```\n pip install" - " --upgrade paddlenlp \n```") + " --upgrade paddlenlp \n```" + ) class DummyObject(type): @@ -551,9 +544,7 @@ def __getattr__(cls, key): # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 -def compare_versions(library_or_version: Union[str, Version], - operation: str, - requirement_version: str): +def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ Args: Compares a library version to some requirement using a given operation. @@ -565,13 +556,10 @@ def compare_versions(library_or_version: Union[str, Version], The version to compare the library version against """ if operation not in STR_OPERATION_TO_FUNC.keys(): - raise ValueError( - f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}" - ) + raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}") operation = STR_OPERATION_TO_FUNC[operation] if isinstance(library_or_version, str): - library_or_version = parse( - importlib_metadata.version(library_or_version)) + library_or_version = parse(importlib_metadata.version(library_or_version)) return operation(library_or_version, parse(requirement_version)) diff --git a/ppdiffusers/ppdiffusers/utils/initializer_utils.py b/ppdiffusers/ppdiffusers/utils/initializer_utils.py index 9c71cc89861c9..263c7c41a030f 100644 --- a/ppdiffusers/ppdiffusers/utils/initializer_utils.py +++ b/ppdiffusers/ppdiffusers/utils/initializer_utils.py @@ -46,9 +46,7 @@ def _no_grad_uniform_(tensor, a, b): def _no_grad_normal_(tensor, mean=0.0, std=1.0): with paddle.no_grad(): - tensor.copy_( - paddle.normal( - mean=mean, std=std, shape=tensor.shape), True) + tensor.copy_(paddle.normal(mean=mean, std=std, shape=tensor.shape), True) return tensor @@ -134,9 +132,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False): Tuple[fan_in, fan_out] """ if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if reverse: num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] @@ -189,8 +185,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False): mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) @@ -216,13 +211,11 @@ def _calculate_gain(nonlinearity, param=None): elif nonlinearity == "leaky_relu": if param is 
None: negative_slope = 0.01 - elif (not isinstance(param, bool) and isinstance(param, int) or - isinstance(param, float)): + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): # True/False are instances of int, hence check above negative_slope = param else: - raise ValueError("negative_slope {} not a valid number".format( - param)) + raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope**2)) elif nonlinearity == "selu": return 3.0 / 4 @@ -230,11 +223,7 @@ def _calculate_gain(nonlinearity, param=None): raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) -def kaiming_uniform_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_uniform method Args: @@ -252,11 +241,7 @@ def kaiming_uniform_(tensor, return _no_grad_uniform_(tensor, -k, k) -def kaiming_normal_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_normal_ Args: @@ -304,8 +289,7 @@ def reset_initialized_parameter(model, include_self=True): """ for _, m in model.named_sublayers(include_self=include_self): if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * - m._kernel_size[1]) + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) k = math.sqrt(k) _no_grad_uniform_(m.weight, -k, k) if hasattr(m, "bias") and getattr(m, "bias") is not None: @@ -330,17 +314,17 @@ def reset_initialized_parameter(model, include_self=True): class Init: def __init__(self): for init_func in [ - uniform_, - normal_, - constant_, - ones_, - zeros_, - xavier_uniform_, - xavier_normal_, - kaiming_uniform_, - kaiming_normal_, - linear_init_, - conv_init_, + uniform_, + normal_, + constant_, + ones_, + zeros_, + xavier_uniform_, + xavier_normal_, + kaiming_uniform_, + kaiming_normal_, + linear_init_, + conv_init_, ]: setattr(self, init_func.__name__, init_func) diff --git a/ppdiffusers/ppdiffusers/utils/load_utils.py b/ppdiffusers/ppdiffusers/utils/load_utils.py index 023551a27ce6d..a1602c8862d80 100644 --- a/ppdiffusers/ppdiffusers/utils/load_utils.py +++ b/ppdiffusers/ppdiffusers/utils/load_utils.py @@ -24,8 +24,11 @@ import numpy as np from .constants import get_map_location_default -from .import_utils import (is_paddle_available, is_safetensors_available, - is_torch_available) +from .import_utils import ( + is_paddle_available, + is_safetensors_available, + is_torch_available, +) from .logging import get_logger logger = get_logger(__name__) @@ -68,8 +71,7 @@ def read_prefix_key(path): with open(path, "rb") as file_handler: end_index = seek_by_string(file_handler, "data.pkl", file_size) file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE) - prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - - len("/data.pkl")) + prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - len("/data.pkl")) return prefix_key.decode("latin") @@ -89,8 +91,7 @@ def seek_by_string(file_handler, string: str, file_size: int) -> int: word_index = 0 if file_handler.tell() >= file_size - 1: - raise Exception( - f"can't find the find the target string<{string}> in the file") + raise Exception(f"can't find the find the target string<{string}> in the file") return 
file_handler.tell() @@ -163,21 +164,18 @@ def find_class(self, mod_name, name): return super().find_class(mod_name, name) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): # if a tensor has shape [M, N] and stride is [1, N], it's column-wise / fortran-style # if a tensor has shape [M, N] and stride is [M, 1], it's row-wise / C-style # defautls to C-style - if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[ - 1] > 1: + if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[1] > 1: order = "F" else: order = "C" # fix bug when load https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth numel = int(np.prod(size)) - return storage[storage_offset:storage_offset + numel].reshape( - size, order=order) + return storage[storage_offset : storage_offset + numel].reshape(size, order=order) def _rebuild_parameter(data, requires_grad, backward_hooks): @@ -207,8 +205,7 @@ def torch_load(path: str, **pickle_load_args): def load_tensor(dtype, numel, key, location): name = f"{prefix_key}/data/{key}" - typed_storage = np.frombuffer( - torch_zip.open(name).read()[:numel], dtype=dtype) + typed_storage = np.frombuffer(torch_zip.open(name).read()[:numel], dtype=dtype) return typed_storage def persistent_load(saved_id): @@ -226,15 +223,13 @@ def persistent_load(saved_id): typed_storage = loaded_storages[key] else: nbytes = numel * _element_size(dtype) - typed_storage = load_tensor(dtype, nbytes, key, - _maybe_decode_ascii(location)) + typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location)) loaded_storages[key] = typed_storage return typed_storage data_iostream = torch_zip.open(f"{prefix_key}/data.pkl").read() - unpickler_stage = UnpicklerWrapperStage( - io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) unpickler_stage.persistent_load = persistent_load state_dict = unpickler_stage.load() torch_zip.close() @@ -263,19 +258,18 @@ def convert_to_paddle(state_dict, return_numpy=False, return_global_step=False): # if "position_id" in k and "int" not in str(v.dtype): # v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64") if v.ndim == 0: - v = v.reshape((1, )) + v = v.reshape((1,)) if not return_numpy: # support bfloat16 if "torch.bfloat16" in str(v.dtype): v = v.float() pd_state_dict[k] = ( paddle.to_tensor(v.numpy()).cast(paddle.bfloat16) - if hasattr(v, "numpy") else - paddle.to_tensor(v).cast(paddle.bfloat16)) + if hasattr(v, "numpy") + else paddle.to_tensor(v).cast(paddle.bfloat16) + ) else: - pd_state_dict[k] = (paddle.to_tensor(v.numpy()) - if hasattr(v, "numpy") else - paddle.to_tensor(v)) + pd_state_dict[k] = paddle.to_tensor(v.numpy()) if hasattr(v, "numpy") else paddle.to_tensor(v) else: pd_state_dict[k] = v.numpy() if hasattr(v, "numpy") else v @@ -290,7 +284,7 @@ def convert_to_numpy(state_dict): # if "position_id" in k and "int" not in str(v.dtype): # v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64") if v.ndim == 0: - v = v.reshape((1, )) + v = v.reshape((1,)) return pd_state_dict @@ -310,19 +304,18 @@ def safetensors_load(path: str): data = load_file(path) else: - raise ImportError( - "`safetensors_load` requires the `safetensors library: `pip install safetensors`." 
- ) + raise ImportError("`safetensors_load` requires the `safetensors library: `pip install safetensors`.") return data def smart_load( - path: str, - map_location: str=None, - return_numpy: bool=False, - return_global_step: bool=False, - return_is_torch_weight: bool=False, ): + path: str, + map_location: str = None, + return_numpy: bool = False, + return_global_step: bool = False, + return_is_torch_weight: bool = False, +): if map_location is None: map_location = get_map_location_default() @@ -335,46 +328,36 @@ def smart_load( return state_dict if suffix in torch_suffix: - state_dict = convert_to_paddle( - torch_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict if suffix in safetensors_suffix: - state_dict = convert_to_paddle( - safetensors_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict # must use safetensors_load first try: - state_dict = convert_to_paddle( - safetensors_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict except Exception: logger.info(f"Cant load file {name} with safetensors!") try: - state_dict = convert_to_paddle( - torch_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict except Exception: - logger.info( - f"Cant load file {name} with torch! We will try to load this with safetensors!" - ) + logger.info(f"Cant load file {name} with torch! We will try to load this with safetensors!") try: state_dict = paddle.load(path, return_numpy=return_numpy) return state_dict except Exception: - logger.info( - f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!" - ) + logger.info(f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!") if state_dict is None: - raise ValueError( - f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!" - ) + raise ValueError(f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!") diff --git a/ppdiffusers/ppdiffusers/utils/logging.py b/ppdiffusers/ppdiffusers/utils/logging.py index 355cb16bd50bd..12b12c075d2ef 100644 --- a/ppdiffusers/ppdiffusers/utils/logging.py +++ b/ppdiffusers/ppdiffusers/utils/logging.py @@ -58,7 +58,8 @@ def _get_default_logging_level(): else: logging.getLogger().warning( f"Unknown option PPDIFFUSERS_VERBOSITY={env_level_str}, " - f"has to be one of: { ', '.join(log_levels.keys()) }") + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) return _default_log_level @@ -104,7 +105,7 @@ def get_log_levels_dict(): return log_levels -def get_logger(name: Optional[str]=None) -> logging.Logger: +def get_logger(name: Optional[str] = None) -> logging.Logger: """ Return a logger with the specified name. 
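# Illustrative sketch (not part of the patch): obtaining the library logger and raising verbosity.
# `get_logger` and the PPDIFFUSERS_VERBOSITY environment variable are shown in the hunks above;
# `set_verbosity_info` is assumed to be exposed by the same logging module.
from ppdiffusers.utils import logging

logging.set_verbosity_info()            # or: export PPDIFFUSERS_VERBOSITY=info
logger = logging.get_logger(__name__)
logger.info("ppdiffusers logging configured")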
@@ -212,8 +213,7 @@ def remove_handler(handler: logging.Handler) -> None: _configure_library_root_logger() - assert handler is not None and handler not in _get_library_root_logger( - ).handlers + assert handler is not None and handler not in _get_library_root_logger().handlers _get_library_root_logger().removeHandler(handler) @@ -247,8 +247,7 @@ def enable_explicit_format() -> None: handlers = _get_library_root_logger().handlers for handler in handlers: - formatter = logging.Formatter( - "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") handler.setFormatter(formatter) diff --git a/ppdiffusers/ppdiffusers/utils/outputs.py b/ppdiffusers/ppdiffusers/utils/outputs.py index cd319b7378749..b71ef22559c47 100644 --- a/ppdiffusers/ppdiffusers/utils/outputs.py +++ b/ppdiffusers/ppdiffusers/utils/outputs.py @@ -60,8 +60,7 @@ def __post_init__(self): raise ValueError(f"{self.__class__.__name__} has no fields.") first_field = getattr(self, class_fields[0].name) - other_fields_are_none = all( - getattr(self, field.name) is None for field in class_fields[1:]) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) if other_fields_are_none and isinstance(first_field, dict): for key, value in first_field.items(): @@ -73,23 +72,16 @@ def __post_init__(self): self[field.name] = v def __delitem__(self, *args, **kwargs): - raise Exception( - f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." - ) + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") def setdefault(self, *args, **kwargs): - raise Exception( - f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." - ) + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") def pop(self, *args, **kwargs): - raise Exception( - f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") def update(self, *args, **kwargs): - raise Exception( - f"You cannot use ``update`` on a {self.__class__.__name__} instance." 
- ) + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") def __getitem__(self, k): if isinstance(k, str): @@ -121,6 +113,6 @@ def to_tuple(self) -> Tuple[Any]: for field in fields(self): if getattr(self, field.name, None) is None: continue - tuples = tuples + (getattr(self, field.name), ) + tuples = tuples + (getattr(self, field.name),) return tuples diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py index a59bfd24f7166..1fa9da783471b 100644 --- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py +++ b/ppdiffusers/ppdiffusers/utils/paddle_utils.py @@ -43,8 +43,7 @@ def manual_seed(self, seed, generator_name=None): if generator_name is None: generator_name = str(time.time()) if generator_name in self.states_: - raise ValueError("state {} already exists".format( - generator_name)) + raise ValueError("state {} already exists".format(generator_name)) orig_rng_state = paddle.get_cuda_rng_state() paddle.seed(seed) self.states_[generator_name] = paddle.get_cuda_rng_state() @@ -55,8 +54,7 @@ def manual_seed(self, seed, generator_name=None): def rng_state(self, generator_name=None): if generator_name is not None: if generator_name not in self.states_: - raise ValueError("state {} does not exist".format( - generator_name)) + raise ValueError("state {} does not exist".format(generator_name)) orig_cuda_rng_state = paddle.get_cuda_rng_state() paddle.set_cuda_rng_state(self.states_[generator_name]) try: @@ -81,16 +79,13 @@ def get_rng_state_tracker(*args, **kwargs): @paddle.jit.not_to_static def randn_pt(shape, dtype=None, name=None, **kwargs): generator = kwargs.get("generator", None) - is_bfloat16 = ("bfloat16" in str(dtype) or - "bfloat16" in paddle.get_default_dtype()) + is_bfloat16 = "bfloat16" in str(dtype) or "bfloat16" in paddle.get_default_dtype() if is_bfloat16: if generator is None: - return randn( - shape, dtype="float16", name=name).cast(paddle.bfloat16) + return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16) else: with get_rng_state_tracker().rng_state(generator): - return randn( - shape, dtype="float16", name=name).cast(paddle.bfloat16) + return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16) else: if generator is None: return randn(shape, dtype=dtype, name=name) @@ -108,24 +103,20 @@ def rand_pt(shape, dtype=None, name=None, **kwargs): return rand(shape, dtype=dtype, name=name) @paddle.jit.not_to_static - def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None, - **kwargs): + def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None, **kwargs): generator = kwargs.get("generator", None) if generator is None: - return randint( - low=low, high=high, shape=shape, dtype=dtype, name=name) + return randint(low=low, high=high, shape=shape, dtype=dtype, name=name) else: with get_rng_state_tracker().rng_state(generator): - return randint( - low=low, high=high, shape=shape, dtype=dtype, name=name) + return randint(low=low, high=high, shape=shape, dtype=dtype, name=name) @paddle.jit.not_to_static def randn_like_pt(x, dtype=None, name=None, **kwargs): generator = kwargs.get("generator", None) if dtype is None: dtype = x.dtype - return randn_pt( - x.shape, dtype=dtype, generator=generator, name=name, **kwargs) + return randn_pt(x.shape, dtype=dtype, generator=generator, name=name, **kwargs) paddle.randn = randn_pt paddle.rand = rand_pt @@ -133,23 +124,19 @@ def randn_like_pt(x, dtype=None, name=None, **kwargs): paddle.randn_like = randn_like_pt def randn_tensor( - 
shape: Union[Tuple, List], - generator: Optional[Union[List["paddle.Generator"], - "paddle.Generator"]]=None, - dtype: Optional["paddle.dtype"]=None, - *kwargs, ): + shape: Union[Tuple, List], + generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None, + dtype: Optional["paddle.dtype"] = None, + *kwargs, + ): """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor will always be created on CPU. """ if isinstance(generator, (list, tuple)): batch_size = shape[0] - shape = (1, ) + tuple(shape[1:]) - latents = [ - randn_pt( - shape, generator=generator[i], dtype=dtype) - for i in range(batch_size) - ] + shape = (1,) + tuple(shape[1:]) + latents = [randn_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] latents = paddle.concat(latents, axis=0) else: latents = randn_pt(shape, generator=generator, dtype=dtype) @@ -157,23 +144,19 @@ def randn_tensor( return latents def rand_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["paddle.Generator"], - "paddle.Generator"]]=None, - dtype: Optional["paddle.dtype"]=None, - *kwargs, ): + shape: Union[Tuple, List], + generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None, + dtype: Optional["paddle.dtype"] = None, + *kwargs, + ): """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor will always be created on CPU. """ if isinstance(generator, (list, tuple)): batch_size = shape[0] - shape = (1, ) + tuple(shape[1:]) - latents = [ - rand_pt( - shape, generator=generator[i], dtype=dtype) - for i in range(batch_size) - ] + shape = (1,) + tuple(shape[1:]) + latents = [rand_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] latents = paddle.concat(latents, axis=0) else: latents = rand_pt(shape, generator=generator, dtype=dtype) @@ -181,18 +164,18 @@ def rand_tensor( return latents def randint_tensor( - low=0, - high=None, - shape: Union[Tuple, List]=[1], - generator: Optional["paddle.Generator"]=None, - dtype: Optional["paddle.dtype"]=None, - *kwargs, ): + low=0, + high=None, + shape: Union[Tuple, List] = [1], + generator: Optional["paddle.Generator"] = None, + dtype: Optional["paddle.dtype"] = None, + *kwargs, + ): """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor will always be created on CPU. 
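# Illustrative sketch (not part of the patch) of the random-tensor helpers above. The export
# path is assumed; per the docstrings, passing a list as `generator` seeds each batch element
# separately, while omitting it (as here) falls through to the patched `paddle.randn`.
import paddle
from ppdiffusers.utils import randn_tensor  # import path assumed

latents = randn_tensor((2, 4, 64, 64), dtype=paddle.float32)
print(latents.shape)  # [2, 4, 64, 64]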
""" - latents = randint_pt( - low=low, high=high, shape=shape, dtype=dtype, generator=generator) + latents = randint_pt(low=low, high=high, shape=shape, dtype=dtype, generator=generator) return latents diff --git a/ppdiffusers/ppdiffusers/utils/pil_utils.py b/ppdiffusers/ppdiffusers/utils/pil_utils.py index 7d41b9c74c07a..bef4901a7e5f8 100644 --- a/ppdiffusers/ppdiffusers/utils/pil_utils.py +++ b/ppdiffusers/ppdiffusers/utils/pil_utils.py @@ -18,8 +18,7 @@ from packaging import version from PIL import Image -if version.parse(version.parse(PIL.__version__).base_version) >= version.parse( - "9.1.0"): +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): PIL_INTERPOLATION = { "linear": PIL.Image.Resampling.BILINEAR, "bilinear": PIL.Image.Resampling.BILINEAR, @@ -60,10 +59,7 @@ def numpy_to_pil(images): images = (images * 255).round().astype("uint8") if images.shape[-1] == 1: # special case for grayscale (single channel) images - pil_images = [ - Image.fromarray( - image.squeeze(), mode="L") for image in images - ] + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] else: pil_images = [Image.fromarray(image) for image in images] diff --git a/ppdiffusers/ppdiffusers/utils/testing_utils.py b/ppdiffusers/ppdiffusers/utils/testing_utils.py index 88a8c7e167c47..02e03ca1e944a 100644 --- a/ppdiffusers/ppdiffusers/utils/testing_utils.py +++ b/ppdiffusers/ppdiffusers/utils/testing_utils.py @@ -31,10 +31,16 @@ import PIL.ImageOps import requests -from .import_utils import (BACKENDS_MAPPING, is_compel_available, - is_fastdeploy_available, is_note_seq_available, - is_opencv_available, is_paddle_available, - is_paddle_version, is_torch_available) +from .import_utils import ( + BACKENDS_MAPPING, + is_compel_available, + is_fastdeploy_available, + is_note_seq_available, + is_opencv_available, + is_paddle_available, + is_paddle_version, + is_torch_available, +) from .logging import get_logger global_rng = random.Random() @@ -51,7 +57,8 @@ if paddle_device not in available_backends: raise ValueError( f"unknown paddle backend for ppdiffusers tests: {paddle_device}. Available backends are:" - f" {available_backends}") + f" {available_backends}" + ) logger.info(f"paddle_device overrode to {paddle_device}") else: paddle_device = "gpu" if paddle.is_compiled_with_cuda() else "cpu" @@ -74,25 +81,19 @@ def paddle_all_close(a, b, *args, **kwargs): if not is_paddle_available(): raise ValueError("Paddle needs to be installed to use this function.") if not paddle.allclose(a, b, *args, **kwargs): - assert ( - False - ), f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}." + assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}." 
return True -def print_tensor_test(tensor, - filename="test_corrections.txt", - expected_tensor_name="expected_slice"): +def print_tensor_test(tensor, filename="test_corrections.txt", expected_tensor_name="expected_slice"): test_name = os.environ.get("PYTEST_CURRENT_TEST") if not paddle.is_tensor(tensor): tensor = paddle.to_tensor(tensor) - tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace( - "\n", "") + tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace("\n", "") # format is usually: # expected_slice = np.array([-0.5713, -0.3018, -0.9814, 0.04663, -0.879, 0.76, -1.734, 0.1044, 1.161]) - output_str = tensor_str.replace("tensor", - f"{expected_tensor_name} = np.array") + output_str = tensor_str.replace("tensor", f"{expected_tensor_name} = np.array") test_file, test_class, test_fn = test_name.split("::") test_fn = test_fn.split()[0] with open(filename, "a") as f: @@ -182,27 +183,27 @@ def require_paddle_2_5(test_case): """ return unittest.skipUnless( is_paddle_available() and is_paddle_version(">=", "2.5.0"), - "test requires Paddle 2.5", )(test_case) + "test requires Paddle 2.5", + )(test_case) def require_paddle(test_case): """ Decorator marking a test that requires Paddle. These tests are skipped when Paddle isn't installed. """ - return unittest.skipUnless(is_paddle_available(), - "test requires Paddle")(test_case) + return unittest.skipUnless(is_paddle_available(), "test requires Paddle")(test_case) def require_torch(test_case): """Decorator marking a test that requires TORCH.""" - return unittest.skipUnless(is_torch_available(), - "test requires TORCH")(test_case) + return unittest.skipUnless(is_torch_available(), "test requires TORCH")(test_case) def require_paddle_gpu(test_case): """Decorator marking a test that requires CUDA and Paddle.""" - return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu", - "test requires Paddle+CUDA")(test_case) + return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu", "test requires Paddle+CUDA")( + test_case + ) def require_compel(test_case): @@ -210,38 +211,32 @@ def require_compel(test_case): Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when the library is not installed. """ - return unittest.skipUnless(is_compel_available(), - "test requires compel")(test_case) + return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case) def require_fastdeploy(test_case): """ Decorator marking a test that requires fastdeploy. These tests are skipped when fastdeploy isn't installed. """ - return unittest.skipUnless(is_fastdeploy_available(), - "test requires fastdeploy")(test_case) + return unittest.skipUnless(is_fastdeploy_available(), "test requires fastdeploy")(test_case) def require_note_seq(test_case): """ Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed. 
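# Illustrative sketch (not part of the patch) of how the skip decorators above are used in tests;
# the test class and body below are hypothetical, not taken from the repo.
import unittest

from ppdiffusers.utils.testing_utils import require_paddle_gpu


class ExampleDeviceTests(unittest.TestCase):
    @require_paddle_gpu  # skipped unless Paddle is installed and a CUDA device is available
    def test_gpu_tensor(self):
        import paddle

        x = paddle.randn([2, 3])
        self.assertEqual(x.shape, [2, 3])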
""" - return unittest.skipUnless(is_note_seq_available(), - "test requires note_seq")(test_case) + return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case) -def load_numpy(arry: Union[str, np.ndarray], - local_path: Optional[str]=None) -> np.ndarray: +def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray: if isinstance(arry, str): # local_path = "/home/patrick_huggingface_co/" if local_path is not None: # local_path can be passed to correct images of tests return os.path.join( local_path, - "/".join([ - arry.split("/")[-5], arry.split("/")[-2], - arry.split("/")[-1] - ]), ) + "/".join([arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]]), + ) elif arry.startswith("http://") or arry.startswith("https://"): response = requests.get(arry) response.raise_for_status() @@ -257,7 +252,8 @@ def load_numpy(arry: Union[str, np.ndarray], else: raise ValueError( "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a" - " ndarray.") + " ndarray." + ) return arry @@ -320,20 +316,17 @@ def preprocess_image(image: PIL.Image, batch_size: int): return 2.0 * image - 1.0 -def export_to_video(video_frames: List[np.ndarray], - output_video_path: str=None) -> str: +def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: if is_opencv_available(): import cv2 else: - raise ImportError(BACKENDS_MAPPING["opencv"][1].format( - "export_to_video")) + raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video")) if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") h, w, c = video_frames[0].shape - video_writer = cv2.VideoWriter( - output_video_path, fourcc, fps=8, frameSize=(w, h)) + video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h)) for i in range(len(video_frames)): img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR) video_writer.write(img) @@ -344,7 +337,8 @@ def load_hf_numpy(path) -> np.ndarray: if not path.startswith("http://") or path.startswith("https://"): path = os.path.join( "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", - urllib.parse.quote(path), ) + urllib.parse.quote(path), + ) return load_numpy(path) @@ -353,7 +347,8 @@ def load_ppnlp_numpy(path) -> np.ndarray: if not path.startswith("http://") or path.startswith("https://"): path = os.path.join( "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/diffusers-testing", - urllib.parse.quote(path), ) + urllib.parse.quote(path), + ) return load_numpy(path) @@ -444,9 +439,7 @@ def pytest_terminal_summary_main(tr, id): f.write("slowest durations\n") for i, rep in enumerate(dlist): if rep.duration < durations_min: - f.write( - f"{len(dlist)-i} durations < {durations_min} secs were omitted" - ) + f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted") break f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") @@ -460,8 +453,7 @@ def summary_failures_short(tr): msg = tr._getfailureheadline(rep) tr.write_sep("_", msg, red=True, bold=True) # chop off the optional leading extra frames, leaving only the last one - longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, - re.M | re.S) + longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) tr._tw.line(longrepr) # note: not printing out any rep.sections to keep the report short @@ 
-496,9 +488,7 @@ def summary_failures_short(tr): tr.summary_warnings() # normal warnings tr.summary_warnings() # final warnings - tr.reportchars = ( - "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) - ) + tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) with open(report_files["passes"], "w") as f: tr._tw = create_terminal_writer(config, f) tr.summary_passes() diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py index 0940509378627..ea039412ef292 100644 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py +++ b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py @@ -20,18 +20,8 @@ # This script references https://cocodataset.org/#keypoints-eval. if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "-g", - "--gt", - type=str, - help="Assign the groud true path.", - default=None) - parser.add_argument( - "-d", - "--dt", - type=str, - help="Assign the detection result path.", - default=None) + parser.add_argument("-g", "--gt", type=str, help="Assign the groud true path.", default=None) + parser.add_argument("-d", "--dt", type=str, help="Assign the detection result path.", default=None) args = parser.parse_args() cocoGt = COCO(args.gt) diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py index 9e56042786a43..9679d0b744e9d 100644 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py +++ b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py @@ -23,6 +23,7 @@ import paddle import paddlehub as hub from annotator.ppdet_hrnet.det_keypoint_unite_infer import PPDetPose + # import PIL from PIL import Image from tqdm import tqdm @@ -46,10 +47,8 @@ def keypoint_to_openpose_kpts(coco_keypoints_list): l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index] r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index] - neck_keypoint_y = int( - (l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) - neck_keypoint_x = int( - (l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) + neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) + neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) neck_keypoint = [ neck_keypoint_x, neck_keypoint_y, @@ -72,19 +71,19 @@ def __call__(self, oriImg, detect_resolution=512, hand=False): img_scalarfactor = detect_resolution / min(oriImg.shape[:2]) result, poseres = self.ppdetpose_pred(oriImg) result["candidate"] = result["candidate"] * img_scalarfactor - oriImg = cv2.resize( - oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) + oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) canvas = oriImg.copy() canvas.fill(0) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], - result["subset"]) + canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) return ( canvas, dict( candidate=result["candidate"].tolist(), - subset=result["subset"].tolist(), ), - poseres, ) + subset=result["subset"].tolist(), + ), + poseres, + ) def ppdetpose_pred(self, image, kpt_threshold=0.3): poseres = self.ppdetpose.ppdet_hrnet_infer(image) @@ -98,7 +97,12 @@ def ppdetpose_pred(self, image, kpt_threshold=0.3): for 
idx, item in enumerate(openpose_kpts): if item[2] > kpt_threshold: subset[kptid][idx] = posnum - kpt = np.array(item + [posnum, ]) + kpt = np.array( + item + + [ + posnum, + ] + ) candidate = np.vstack((candidate, kpt)) posnum += 1 return {"candidate": candidate, "subset": subset}, poseres @@ -138,7 +142,8 @@ def resize_image(input_image, resolution): img = cv2.resize( input_image, (W, H), - interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, ) + interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, + ) return img @@ -151,11 +156,7 @@ def get_keypoints_result_coco_format(paths, detector, do_gt): out_dir_path = pathlib.Path(paths[2]) if not os.path.exists(out_dir_path): os.makedirs(out_dir_path) - files = sorted([ - file - for ext in IMAGE_EXTENSIONS - for file in in_dir_path.glob("*.{}".format(ext)) - ]) + files = sorted([file for ext in IMAGE_EXTENSIONS for file in in_dir_path.glob("*.{}".format(ext))]) output = [] index = -1 for file in tqdm(files): @@ -165,8 +166,7 @@ def get_keypoints_result_coco_format(paths, detector, do_gt): input_image = HWC3(im) canvas, keypoints_result, poseres = detector(input_image) if len(paths) == 3: - Image.fromarray(canvas).save( - os.path.join(out_dir_path, os.path.basename(file))) + Image.fromarray(canvas).save(os.path.join(out_dir_path, os.path.basename(file))) if len(poseres["keypoint"][0]) == 0: sample_dict = { "image_id": index, @@ -209,76 +209,72 @@ def get_keypoints_result_coco_format(paths, detector, do_gt): json.dumps( { "annotations": output, - "images": [{ - "id": item - } for item in list(range(index + 1))], - "categories": [{ - "supercategory": "person", - "id": 1, - "name": "person", - "keypoints": [ - "nose", - "left_eye", - "right_eye", - "left_ear", - "right_ear", - "left_shoulder", - "right_shoulder", - "left_elbow", - "right_elbow", - "left_wrist", - "right_wrist", - "left_hip", - "right_hip", - "left_knee", - "right_knee", - "left_ankle", - "right_ankle", - ], - "skeleton": [ - [16, 14], - [14, 12], - [17, 15], - [15, 13], - [12, 13], - [6, 12], - [7, 13], - [6, 7], - [6, 8], - [7, 9], - [8, 10], - [9, 11], - [2, 3], - [1, 2], - [1, 3], - [2, 4], - [3, 5], - [4, 6], - [5, 7], - ], - }], + "images": [{"id": item} for item in list(range(index + 1))], + "categories": [ + { + "supercategory": "person", + "id": 1, + "name": "person", + "keypoints": [ + "nose", + "left_eye", + "right_eye", + "left_ear", + "right_ear", + "left_shoulder", + "right_shoulder", + "left_elbow", + "right_elbow", + "left_wrist", + "right_wrist", + "left_hip", + "right_hip", + "left_knee", + "right_knee", + "left_ankle", + "right_ankle", + ], + "skeleton": [ + [16, 14], + [14, 12], + [17, 15], + [15, 13], + [12, 13], + [6, 12], + [7, 13], + [6, 7], + [6, 8], + [7, 9], + [8, 10], + [9, 11], + [2, 3], + [1, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + [5, 7], + ], + } + ], }, - indent=4, )) + indent=4, + ) + ) else: json_file.write(json.dumps(output, indent=4)) parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--do_gt", - action="store_true", - help="whether to predict unseen future data") +parser.add_argument("--do_gt", action="store_true", help="whether to predict unseen future data") parser.add_argument( "path", type=str, nargs=3, - help=( - "Paths to the input images dir, output json file, and output openpose images dir" - ), ) + help=("Paths to the input images dir, output json file, and output openpose images dir"), +) -IMAGE_EXTENSIONS = { - "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", 
"tiff", "webp" -} +IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} if __name__ == "__main__": args = parser.parse_args() diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py index 015f143827f2b..d5fb70e8d90c9 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py @@ -16,17 +16,24 @@ import paddle import torch -from diffusers import \ - StableDiffusionImageVariationPipeline as \ - DiffusersStableDiffusionImageVariationPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPVisionConfig, - CLIPVisionModelWithProjection) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) -from ppdiffusers import \ - StableDiffusionImageVariationPipeline as \ - PPDiffusersStableDiffusionImageVariationPipeline +from diffusers import ( + StableDiffusionImageVariationPipeline as DiffusersStableDiffusionImageVariationPipeline, +) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import ( + StableDiffusionImageVariationPipeline as PPDiffusersStableDiffusionImageVariationPipeline, +) from ppdiffusers import UNet2DConditionModel from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -47,10 +54,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): return new_vae_or_unet -def convert_hf_clip_to_ppnlp_clip(clip, - dtype="float32", - is_text_encoder=True, - need_prefix=False): +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): new_model_state = {} transformers2ppnlp = { ".encoder.": ".transformer.", @@ -69,9 +73,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -85,7 +87,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and need_prefix: name = "clip." 
+ name @@ -123,8 +125,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -148,17 +149,19 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionImageVariationPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False) + diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False + ) safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True) + diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -173,18 +176,14 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, # make sure vision_config.update({"projection_dim": pp_unet.config.cross_attention_dim}) - safety_checker_config.update({ - "projection_dim": pp_unet.config.cross_attention_dim - }) + safety_checker_config.update({"projection_dim": pp_unet.config.cross_attention_dim}) # 3. image_encoder - image_encoder = CLIPVisionModelWithProjection( - CLIPVisionConfig.from_dict(vision_config)) + image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config)) image_encoder.set_dict(image_encoder_state_dict) check_keys(image_encoder, image_encoder_state_dict) # 4. safety_checker - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) check_keys(pp_safety_checker, safety_checker_state_dict) # 5. 
scheduler @@ -200,12 +199,10 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -214,12 +211,12 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") # 7. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionImageVariationPipeline( @@ -228,15 +225,15 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, unet=pp_unet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 8. save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -247,7 +244,7 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="sd-image-variations-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py index 756e12bb3c97b..f0a64446ba1d7 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py @@ -17,14 +17,21 @@ import paddle import torch -from diffusers import \ - StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline +from diffusers import ( + StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline, +) from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LMSDiscreteScheduler, PNDMScheduler) -from ppdiffusers import \ - StableDiffusionUpscalePipeline as PPDiffusersStableDiffusionUpscalePipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import ( + StableDiffusionUpscalePipeline as 
PPDiffusersStableDiffusionUpscalePipeline, +) from ppdiffusers import UNet2DConditionModel paddle.set_device("cpu") @@ -63,9 +70,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -79,7 +84,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." + name @@ -104,23 +109,23 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionUpscalePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) max_noise_level = diffusers_pipe.max_noise_level # 1. vae @@ -134,8 +139,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) # 3. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 4. 
scheduler @@ -150,12 +154,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( beta_schedule=beta_schedule, beta_start=beta_start, num_train_timesteps=num_train_timesteps, - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule=beta_schedule) + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule) elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -164,7 +166,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( clip_sample=False, prediction_type="v_prediction", set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -183,18 +186,19 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "ddpm" in scheduler_type: pp_low_res_scheduler = DDPMScheduler( beta_end=beta_end, beta_schedule=beta_schedule, beta_start=beta_start, - num_train_timesteps=num_train_timesteps, ) + num_train_timesteps=num_train_timesteps, + ) elif "lms" in scheduler_type: pp_low_res_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule=beta_schedule) + beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule + ) elif "ddim" in scheduler_type: pp_low_res_scheduler = DDIMScheduler( beta_start=beta_start, @@ -203,7 +207,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -219,7 +224,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( tokenizer=pp_tokenizer, unet=pp_unet, low_res_scheduler=pp_low_res_scheduler, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 9. 
save_pretrained paddle_pipe.save_pretrained(output_path) @@ -227,8 +233,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -239,7 +244,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="stable-diffusion-x4-upscaler-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py index 4c29f3059b3a1..b3e0ece7e6a03 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py @@ -17,16 +17,26 @@ import paddle import torch -from diffusers import \ - VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline +from diffusers import VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline from paddlenlp.transformers import ( - CLIPFeatureExtractor, CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection) + CLIPFeatureExtractor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) -from ppdiffusers import \ - VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers import ( + VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline, +) from ppdiffusers.pipelines.versatile_diffusion import UNetFlatConditionModel paddle.set_device("cpu") @@ -46,10 +56,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): return new_vae_or_unet -def convert_hf_clip_to_ppnlp_clip(clip, - dtype="float32", - is_text_encoder=True, - need_prefix=False): +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): new_model_state = {} transformers2ppnlp = { ".encoder.": ".transformer.", @@ -68,9 +75,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -84,7 +89,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and need_prefix: name = "clip." 
+ name @@ -122,8 +127,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -147,20 +151,22 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersVersatileDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) image_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.image_unet) text_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_unet) text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False) + diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False + ) image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False) + diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -179,14 +185,12 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, check_keys(pp_text_unet, text_unet_state_dict) # 4. image_encoder - pp_image_encoder = CLIPVisionModelWithProjection( - CLIPVisionConfig.from_dict(vision_config)) + pp_image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config)) pp_image_encoder.set_dict(image_encoder_state_dict) check_keys(pp_image_encoder, image_encoder_state_dict) # 5. 
text_encoder - pp_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig.from_dict(text_config)) + pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config)) pp_text_encoder.set_dict(text_encoder_state_dict) check_keys(pp_text_encoder, text_encoder_state_dict) @@ -203,12 +207,10 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -217,13 +219,13 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") with tempfile.TemporaryDirectory() as tmpdirname: - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") # 7. tokenizer diffusers_pipe.tokenizer.save_pretrained(tmpdirname) pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) @@ -236,15 +238,15 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, image_unet=pp_image_unet, text_unet=pp_text_unet, vae=pp_vae, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 9. 
save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -255,7 +257,7 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="versatile-diffusion-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py index 62de6daa072d9..ff8c68985a249 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py @@ -18,17 +18,28 @@ import paddle import torch from diffusers import AltDiffusionPipeline as DiffusersAltDiffusionPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPVisionConfig, - XLMRobertaTokenizer) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPVisionConfig, + XLMRobertaTokenizer, +) from ppdiffusers import AltDiffusionPipeline as PPDiffusersAltDiffusionPipeline from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, RobertaSeriesModelWithTransformation) + RobertaSeriesConfig, + RobertaSeriesModelWithTransformation, +) from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker paddle.set_device("cpu") @@ -67,9 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -83,7 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." 
+ name @@ -108,8 +117,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -119,10 +127,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"): new_model_state = {} mappings = [ - [ - "embeddings.word_embeddings.weight", - "embeddings.word_embeddings.weight" - ], + ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], [ "embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight", @@ -224,21 +229,17 @@ def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"): hf_name = prefix + hf_name pp_name = prefix + pp_name if need_transpose: - new_model_state[pp_name] = ( - state_dict[hf_name].t().cpu().numpy().astype(dtype)) + new_model_state[pp_name] = state_dict[hf_name].t().cpu().numpy().astype(dtype) else: - new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype( - dtype) + new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype(dtype) new_config = xlm_roberta.config.to_dict() return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) ( @@ -246,7 +247,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( text_encoder_config, ) = convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(diffusers_pipe.text_encoder) safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False) + diffusers_pipe.safety_checker, is_text_encoder=False + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -264,8 +266,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( pp_text_encoder.set_dict(text_encoder_state_dict) # 4. safety_checker - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) # 5. 
scheduler @@ -281,7 +282,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -296,8 +298,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( elif scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif scheduler_type == "ddim": @@ -308,8 +309,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( with tempfile.TemporaryDirectory() as tmpdirname: # 6. feature_extractor # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") # 7. tokenizer diffusers_pipe.tokenizer.save_pretrained(tmpdirname) @@ -323,15 +323,15 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( unet=pp_unet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=scheduler, ) + scheduler=scheduler, + ) # 9. save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -342,7 +342,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="AltDiffusion-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py index 17aa70d3ef95a..bd8d3e8bbb152 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py @@ -40,8 +40,7 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -52,11 +51,11 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): "--output_path", type=str, default="paddle_models/sd-controlnet-canny", - help="The output path.", ) + help="The output path.", + ) args = parser.parse_args() - th_controlnet = DiffusersControlNetModel.from_pretrained( - args.pretrained_model_name_or_path) + th_controlnet = 
DiffusersControlNetModel.from_pretrained(args.pretrained_model_name_or_path) controlnet_state_dict = convert_to_ppdiffusers(th_controlnet) pp_controlnet = PPDiffusersControlNetModel.from_config(th_controlnet.config) pp_controlnet.set_dict(controlnet_state_dict) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py index 021da51309528..7cd30d3c3e077 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py @@ -21,10 +21,8 @@ from paddlenlp.transformers import BertTokenizer from ppdiffusers import AutoencoderKL, DDIMScheduler, LDMBertModel -from ppdiffusers import \ - LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline -from ppdiffusers import (LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) +from ppdiffusers import LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline +from ppdiffusers import LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel paddle.set_device("cpu") @@ -87,15 +85,14 @@ def convert_hf_ldmbert_to_ppnlp_ldmbert(ldmbert, dtype="float32"): return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersLDMTextToImagePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert( - diffusers_pipe.bert) + bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert(diffusers_pipe.bert) # 1. vqvae pp_vqvae = AutoencoderKL.from_config(diffusers_pipe.vqvae.config) @@ -123,12 +120,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -137,15 +132,15 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") with tempfile.TemporaryDirectory() as tmpdirname: # 5. tokenizer diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = BertTokenizer.from_pretrained( - tmpdirname, model_max_length=77) + pp_tokenizer = BertTokenizer.from_pretrained(tmpdirname, model_max_length=77) # 6. 
create ppdiffusers pipe paddle_pipe = PPDiffusersLDMTextToImagePipeline( @@ -153,7 +148,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( bert=pp_bert, tokenizer=pp_tokenizer, unet=pp_unet, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 7. save_pretrained paddle_pipe.save_pretrained(output_path) @@ -161,8 +157,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -173,7 +168,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="ldm-text2im-large-256-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py index 212808cd405fa..519d032808939 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py @@ -18,15 +18,16 @@ import paddle import torch from diffusers import PaintByExamplePipeline as DiffusersPaintByExamplePipeline + # CLIPImageProcessor need paddlenlp latest from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig from ppdiffusers import AutoencoderKL -from ppdiffusers import \ - PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline +from ppdiffusers import PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline from ppdiffusers import PNDMScheduler, UNet2DConditionModel -from ppdiffusers.pipelines.paint_by_example.image_encoder import \ - PaintByExampleImageEncoder +from ppdiffusers.pipelines.paint_by_example.image_encoder import ( + PaintByExampleImageEncoder, +) paddle.set_device("cpu") @@ -63,9 +64,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): ".post_layernorm.": ".ln_post.", } ignore_value = ["position_ids", "mapper"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids and mapper @@ -79,7 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.cpu().numpy().astype(dtype) @@ -93,8 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -118,15 +116,14 @@ def check_keys(model, state_dict): 
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_paintbyexample_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_paintbyexample_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersPaintByExamplePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder) + image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.image_encoder) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -138,8 +135,7 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( check_keys(pp_unet, unet_state_dict) # 3. image_encoder - pp_image_encoder = PaintByExampleImageEncoder( - CLIPVisionConfig.from_dict(image_encoder_config)) + pp_image_encoder = PaintByExampleImageEncoder(CLIPVisionConfig.from_dict(image_encoder_config)) pp_image_encoder.set_dict(image_encoder_state_dict) check_keys(pp_image_encoder, image_encoder_state_dict) # 4. scheduler @@ -158,7 +154,8 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( scheduler=pp_scheduler, safety_checker=None, feature_extractor=feature_extractor, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) # 6. save_pretrained paddle_pipe.save_pretrained(output_path) @@ -166,8 +163,7 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -178,7 +174,9 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( "--output_path", type=str, default="./Paint-by-Example", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_paintbyexample_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py index f1d3d6bd2462f..fa189095cbb9d 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py @@ -17,17 +17,22 @@ import paddle import torch -from diffusers import \ - StableDiffusionDepth2ImgPipeline as \ - DiffusersStableDiffusionDepth2ImgPipeline -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - CLIPTokenizer, DPTConfig, - DPTForDepthEstimation, DPTImageProcessor) +from diffusers import ( + StableDiffusionDepth2ImgPipeline as DiffusersStableDiffusionDepth2ImgPipeline, +) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + DPTConfig, + DPTForDepthEstimation, + DPTImageProcessor, +) from ppdiffusers import AutoencoderKL, PNDMScheduler 
-from ppdiffusers import \ - StableDiffusionDepth2ImgPipeline as \ - PPDiffusersStableDiffusionDepth2ImgPipeline +from ppdiffusers import ( + StableDiffusionDepth2ImgPipeline as PPDiffusersStableDiffusionDepth2ImgPipeline, +) from ppdiffusers import UNet2DConditionModel paddle.set_device("cpu") @@ -66,9 +71,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -82,7 +85,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.cpu().numpy().astype(dtype) @@ -117,17 +120,15 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionDepth2ImgPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - depth_estimator_state_dict = convert_to_ppdiffusers( - diffusers_pipe.depth_estimator) - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder) + depth_estimator_state_dict = convert_to_ppdiffusers(diffusers_pipe.depth_estimator) + text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.text_encoder) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -138,8 +139,7 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) check_keys(pp_unet, unet_state_dict) # 3. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) check_keys(pp_text_encoder, text_encoder_state_dict) # 4. scheduler @@ -168,7 +168,8 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( unet=pp_unet, feature_extractor=pp_feature_extractor, depth_estimator=pp_depth_estimator, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 9. 
save_pretrained paddle_pipe.save_pretrained(output_path) @@ -176,8 +177,7 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -188,7 +188,9 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( "--output_path", type=str, default="stable-diffusion-2-depth", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py index 9ec5f95b55248..bd8178c872874 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py @@ -17,18 +17,27 @@ import paddle import torch -from diffusers import \ - StableDiffusionControlNetPipeline as \ - DiffusersStableDiffusionControlNetPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextConfig, - CLIPTextModel, CLIPTokenizer, - CLIPVisionConfig) - -from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler, - LMSDiscreteScheduler, PNDMScheduler) -from ppdiffusers import \ - StableDiffusionControlNetPipeline as \ - PPDiffusersStableDiffusionControlNetPipeline +from diffusers import ( + StableDiffusionControlNetPipeline as DiffusersStableDiffusionControlNetPipeline, +) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionConfig, +) + +from ppdiffusers import ( + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import ( + StableDiffusionControlNetPipeline as PPDiffusersStableDiffusionControlNetPipeline, +) from ppdiffusers import UNet2DConditionModel from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -69,9 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -85,7 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." 
+ name @@ -110,26 +117,25 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionControlNetPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) - requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", - False) + pretrained_model_name_or_path, use_auth_token=True + ) + requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) controlnet_state_dict = convert_to_ppdiffusers(diffusers_pipe.controlnet) text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -142,14 +148,12 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) # 3. controlnet - pp_controlnet = ControlNetModel.from_config( - diffusers_pipe.controlnet.config) + pp_controlnet = ControlNetModel.from_config(diffusers_pipe.controlnet.config) pp_controlnet.set_dict(controlnet_state_dict) # 4. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 5. scheduler @@ -165,12 +169,10 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -179,7 +181,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -192,14 +195,14 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( # 7. 
feature_extractor # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + "CompVis/stable-diffusion-v1-4/feature_extractor" + ) # 8. safety_checker ( safety_checker_state_dict, - safety_checker_config, ) = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False) - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + safety_checker_config, + ) = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.safety_checker, is_text_encoder=False) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) # 9. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline( @@ -210,7 +213,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( controlnet=pp_controlnet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) else: # 9. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline( @@ -222,7 +226,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( safety_checker=None, feature_extractor=None, scheduler=pp_scheduler, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: _internal_dict = dict(paddle_pipe._internal_dict) if _internal_dict["_ppdiffusers_version"] == "0.0.0": @@ -234,8 +239,7 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -246,7 +250,9 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( "--output_path", type=str, default="control_sd15_canny-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py index 6d3811cc0bc82..a3374a432caa4 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py @@ -17,16 +17,22 @@ import paddle import torch -from diffusers import \ - StableDiffusionPipeline as DiffusersStableDiffusionPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextConfig, - CLIPTextModel, CLIPTokenizer, - CLIPVisionConfig) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) -from ppdiffusers import \ - StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline +from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPTextConfig, + CLIPTextModel, + 
CLIPTokenizer, + CLIPVisionConfig, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline from ppdiffusers import UNet2DConditionModel from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -67,9 +73,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -83,7 +87,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." + name @@ -108,25 +112,24 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) - requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", - False) + pretrained_model_name_or_path, use_auth_token=True + ) + requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -139,8 +142,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) # 3. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 4. 
scheduler @@ -156,12 +158,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -170,7 +170,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -183,14 +184,14 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # 6. feature_extractor # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + "CompVis/stable-diffusion-v1-4/feature_extractor" + ) # 7. safety_checker ( safety_checker_state_dict, - safety_checker_config, ) = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False) - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + safety_checker_config, + ) = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.safety_checker, is_text_encoder=False) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) # 8. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionPipeline( @@ -200,7 +201,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( unet=pp_unet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) else: # 8. 
create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionPipeline( @@ -211,7 +213,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( safety_checker=None, feature_extractor=None, scheduler=pp_scheduler, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: _internal_dict = dict(paddle_pipe._internal_dict) if _internal_dict["_ppdiffusers_version"] == "0.0.0": @@ -223,8 +226,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -235,7 +237,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="stable-diffusion-v1-5-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py index c5c28bfce9e02..204766187c39c 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py @@ -18,8 +18,11 @@ import paddle import torch from diffusers import UnCLIPPipeline as DiffusersUnCLIPPipeline -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, +) from ppdiffusers import PriorTransformer from ppdiffusers import UnCLIPPipeline as PPDiffusersUnCLIPPipeline @@ -43,10 +46,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32", prefix=""): return new_vae_or_unet -def convert_hf_clip_to_ppnlp_clip(clip, - dtype="float32", - is_text_encoder=True, - need_prefix=False): +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): new_model_state = {} transformers2ppnlp = { ".encoder.": ".transformer.", @@ -65,9 +65,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -81,7 +79,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and need_prefix: name = "clip." 
+ name @@ -119,8 +117,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -144,20 +141,17 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) prior_state_dict = convert_to_ppdiffusers(diffusers_pipe.prior) decoder_state_dict = convert_to_ppdiffusers(diffusers_pipe.decoder) text_proj_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_proj) - super_res_first_state_dict = convert_to_ppdiffusers( - diffusers_pipe.super_res_first) - super_res_last_state_dict = convert_to_ppdiffusers( - diffusers_pipe.super_res_last) + super_res_first_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_first) + super_res_last_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_last) text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False) + diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False + ) pp_prior = PriorTransformer.from_config(diffusers_pipe.prior.config) pp_prior.set_dict(prior_state_dict) @@ -167,32 +161,25 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, pp_decoder.set_dict(decoder_state_dict) check_keys(pp_decoder, decoder_state_dict) - pp_text_proj = UnCLIPTextProjModel.from_config( - diffusers_pipe.text_proj.config) + pp_text_proj = UnCLIPTextProjModel.from_config(diffusers_pipe.text_proj.config) pp_text_proj.set_dict(text_proj_state_dict) check_keys(pp_text_proj, text_proj_state_dict) - pp_super_res_first = UNet2DModel.from_config( - diffusers_pipe.super_res_first.config) + pp_super_res_first = UNet2DModel.from_config(diffusers_pipe.super_res_first.config) pp_super_res_first.set_dict(super_res_first_state_dict) check_keys(pp_super_res_first, super_res_first_state_dict) - pp_super_res_last = UNet2DModel.from_config( - diffusers_pipe.super_res_last.config) + pp_super_res_last = UNet2DModel.from_config(diffusers_pipe.super_res_last.config) pp_super_res_last.set_dict(super_res_last_state_dict) check_keys(pp_super_res_last, super_res_last_state_dict) - pp_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig.from_dict(text_config)) + pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config)) pp_text_encoder.set_dict(text_encoder_state_dict) check_keys(pp_text_encoder, text_encoder_state_dict) - pp_prior_scheduler = UnCLIPScheduler.from_config( - diffusers_pipe.prior_scheduler.config) - pp_decoder_scheduler = UnCLIPScheduler.from_config( - diffusers_pipe.decoder_scheduler.config) - pp_super_res_scheduler = UnCLIPScheduler.from_config( - diffusers_pipe.super_res_scheduler.config) + pp_prior_scheduler = 
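# check_keys above only compares parameter names: whatever the Paddle module
# expects but the converted dict lacks is "missing", and the reverse is
# "mismatched". The same idea over plain sets (key names made up for illustration):
expected = {"conv_in.weight", "conv_in.bias", "time_embedding.linear_1.weight"}
converted = {"conv_in.weight", "conv_in.bias", "time_embed.0.weight"}
print("missing:", sorted(expected - converted))
print("mismatched:", sorted(converted - expected))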
UnCLIPScheduler.from_config(diffusers_pipe.prior_scheduler.config) + pp_decoder_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.decoder_scheduler.config) + pp_super_res_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.super_res_scheduler.config) with tempfile.TemporaryDirectory() as tmpdirname: # 5. feature_extractor @@ -209,15 +196,15 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, super_res_last=pp_super_res_last, prior_scheduler=pp_prior_scheduler, decoder_scheduler=pp_decoder_scheduler, - super_res_scheduler=pp_super_res_scheduler, ) + super_res_scheduler=pp_super_res_scheduler, + ) # 6. save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -228,7 +215,7 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="./karlo-v1-alpha", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py index eb8c950cc052e..d5c0fad1746bf 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py @@ -23,8 +23,7 @@ from ppdiffusers import Transformer2DModel from ppdiffusers import VQDiffusionPipeline as PPDiffusersVQDiffusionPipeline from ppdiffusers import VQDiffusionScheduler, VQModel -from ppdiffusers.pipelines.vq_diffusion import \ - LearnedClassifierFreeSamplingEmbeddings +from ppdiffusers.pipelines.vq_diffusion import LearnedClassifierFreeSamplingEmbeddings paddle.set_device("cpu") @@ -62,9 +61,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -78,7 +75,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." 
+ name @@ -103,20 +100,17 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) # 1. vqvae vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) @@ -124,35 +118,33 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, transformer_state_dict = convert_to_ppdiffusers(diffusers_pipe.transformer) # 3. learned_classifier_free_sampling_embeddings learned_classifier_free_sampling_embeddings_state_dict = convert_to_ppdiffusers( - diffusers_pipe.learned_classifier_free_sampling_embeddings) + diffusers_pipe.learned_classifier_free_sampling_embeddings + ) # 4.text_encoder text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) # 1. vqvae pp_vqvae = VQModel.from_config(diffusers_pipe.vqvae.config) pp_vqvae.set_dict(vqvae_state_dict) # 2. transformer - pp_transformer = Transformer2DModel.from_config( - diffusers_pipe.transformer.config) + pp_transformer = Transformer2DModel.from_config(diffusers_pipe.transformer.config) pp_transformer.set_dict(transformer_state_dict) # 3. pp_learned_classifier_free_sampling_embeddings - pp_learned_classifier_free_sampling_embeddings = ( - LearnedClassifierFreeSamplingEmbeddings.from_config( - diffusers_pipe.learned_classifier_free_sampling_embeddings.config)) - pp_learned_classifier_free_sampling_embeddings.set_dict( - learned_classifier_free_sampling_embeddings_state_dict) + pp_learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings.from_config( + diffusers_pipe.learned_classifier_free_sampling_embeddings.config + ) + pp_learned_classifier_free_sampling_embeddings.set_dict(learned_classifier_free_sampling_embeddings_state_dict) # 4. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 5. scheduler - pp_scheduler = VQDiffusionScheduler.from_config( - diffusers_pipe.scheduler.config) + pp_scheduler = VQDiffusionScheduler.from_config(diffusers_pipe.scheduler.config) with tempfile.TemporaryDirectory() as tmpdirname: # 6. 
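# The CLIP config assembled above is partly derived rather than copied, e.g.
# vision_mlp_ratio is intermediate_size // hidden_size. With ViT-L/14-style
# values, shown here purely as an example, that ratio comes out to 4:
hidden_size, intermediate_size = 1024, 4096  # illustrative HF config values
print(intermediate_size // hidden_size)      # 4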
tokenizer @@ -166,7 +158,8 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, tokenizer=pp_tokenizer, transformer=pp_transformer, learned_classifier_free_sampling_embeddings=pp_learned_classifier_free_sampling_embeddings, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 8. save_pretrained paddle_pipe.save_pretrained(output_path) @@ -174,8 +167,7 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -186,7 +178,9 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="microsoft/vq-diffusion-ithq-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_vq_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py index 41b5460d10922..b57a9ef31149d 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py @@ -30,10 +30,17 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionPipeline, UNet2DConditionModel) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) paddle.set_device("cpu") MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -116,8 +123,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): FILENAME = f"archive/{file_name}".encode("latin") padding_size_plus_fbxx = 4 + 14 data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len( - FILENAME) + padding_size_plus_fbxx + offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx with open(file, "rb") as r: r.seek(offset) for bytes_data in io.BytesIO(r.read()): @@ -130,8 +136,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): return out, offset + len(out) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): if isinstance(storage, TensorMeta): storage.size = size return storage @@ -169,8 +174,7 @@ def persistent_load_stage1(saved_id): data_iostream, pre_offset = get_data_iostream(path, file_name="data.pkl") # 1. 
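# convert_orig_sd_ckpt_to_ppdiffusers.py reads the original torch .ckpt without
# importing torch: a checkpoint saved in torch's zip format is an ordinary zip
# archive whose pickle stream sits at "archive/data.pkl", and the script walks to
# it by raw byte offsets (the 30-byte local file header constant plus the entry
# name and padding). A tiny in-memory sketch of just the archive layout, using
# only the standard library -- the payload bytes are a stand-in, not a real
# checkpoint:
import io
import zipfile

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("archive/data.pkl", b"\x80\x02.")  # placeholder pickle bytes
with zipfile.ZipFile(io.BytesIO(buf.getvalue())) as zf:
    print([n for n in zf.namelist() if n.endswith("data.pkl")])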
read the structure of storage - unpickler_stage1 = UnpicklerWrapperStage( - io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage1 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) unpickler_stage1.persistent_load = persistent_load_stage1 result_stage1 = unpickler_stage1.load() @@ -202,17 +206,15 @@ def extract_maybe_dict(result): # `MZ_ZIP_LOCAL_DIR_HEADER_SIZE` is from: https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/inline_container.cc#L186 # `16` is the fixed characters size from binary file. # `filename_with_fb` is the length of dynamic data key name - file_handler.seek( - MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1) + file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1) - padding_offset = np.frombuffer( - file_handler.read(2)[:1], dtype=np.uint8)[0] + padding_offset = np.frombuffer(file_handler.read(2)[:1], dtype=np.uint8)[0] file_handler.read(padding_offset) # save the tensor info in result to re-use memory stage1_key_to_tensor[key] = np.frombuffer( - file_handler.read(tensor_meta.nbytes), - dtype=tensor_meta.dtype).reshape(tensor_meta.size) + file_handler.read(tensor_meta.nbytes), dtype=tensor_meta.dtype + ).reshape(tensor_meta.size) def persistent_load_stage2(saved_id): assert isinstance(saved_id, tuple) @@ -220,8 +222,7 @@ def persistent_load_stage2(saved_id): return stage1_key_to_tensor[key] # 4. read the structure of storage - unpickler_stage2 = UnpicklerWrapperStage( - io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage2 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) unpickler_stage2.persistent_load = persistent_load_stage2 result_stage2 = unpickler_stage2.load() @@ -253,8 +254,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -270,8 +270,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -313,8 +312,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -322,12 +320,13 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. 
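# Stage 2 of the loader above rebuilds each tensor by reading tensor_meta.nbytes
# raw bytes from the archive and viewing them with numpy. The core of that step
# on a toy buffer (dtype and shape stand in for the metadata recovered in stage 1):
import io
import numpy as np

dtype, shape = np.dtype("<f4"), (2, 3)                 # pretend stage-1 metadata
raw = io.BytesIO(np.arange(6, dtype=dtype).tobytes())  # pretend storage bytes
count = int(np.prod(shape))
tensor = np.frombuffer(raw.read(dtype.itemsize * count), dtype=dtype).reshape(shape)
print(tensor.shape, tensor.dtype)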
It splits attention layers, and takes into account additional replacements @@ -335,9 +334,7 @@ def assign_to_checkpoint( Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -345,13 +342,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) @@ -363,8 +358,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -374,8 +368,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -403,34 +396,28 @@ def create_unet_diffusers_config(original_config, image_size: int): unet_params = original_config.model.params.unet_config.params vae_params = original_config.model.params.first_stage_config.params.ddconfig - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2**(len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = (unet_params.use_linear_in_transformer - if "use_linear_in_transformer" in unet_params else - False) + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: @@ -446,7 +433,8 @@ def create_unet_diffusers_config(original_config, image_size: int): layers_per_block=unet_params.num_res_blocks, 
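# assign_to_checkpoint splits a fused qkv attention weight into separate query /
# key / value tensors by regrouping it per head and splitting along the fused
# axis, as in the snippet above. A numeric sketch of that reshape/split
# (num_head_channels and the random tensor are illustrative):
import numpy as np

num_head_channels = 8
old_tensor = np.random.randn(96, 32, 1).astype("float32")   # fused qkv weight, (3*C, C, 1)
channels = old_tensor.shape[0] // 3                          # 32
target_shape = (-1, channels) if old_tensor.ndim == 3 else (-1,)
num_heads = old_tensor.shape[0] // num_head_channels // 3    # 4
regrouped = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = np.split(regrouped, 3, axis=1)
print(query.reshape(target_shape).shape)                     # (32, 32)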
cross_attention_dim=unet_params.context_dim, attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, ) + use_linear_projection=use_linear_projection, + ) return config @@ -470,7 +458,8 @@ def create_vae_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config @@ -479,14 +468,12 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -507,8 +494,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -521,17 +507,12 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -540,35 +521,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in 
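# convert_ldm_unet_checkpoint first buckets the flat LDM keys by block index
# (e.g. "input_blocks.3.0.in_layers.0.weight" belongs to input block 3) before
# renaming them, using the set/dict comprehensions shown here. The same trick on
# a couple of made-up keys:
unet_keys = [
    "input_blocks.0.0.weight",
    "input_blocks.1.0.in_layers.0.weight",
    "input_blocks.1.1.norm.weight",
    "middle_block.0.in_layers.0.weight",
]
num_input_blocks = len({".".join(k.split(".")[:2]) for k in unet_keys if "input_blocks" in k})
input_blocks = {
    layer_id: [k for k in unet_keys if f"input_blocks.{layer_id}" in k]
    for layer_id in range(num_input_blocks)
}
print(num_input_blocks, input_blocks[1])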
unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -577,21 +546,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -603,7 +568,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -616,19 +582,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -637,14 +602,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ 
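# In the loops above, the flat input_blocks / output_blocks index i is mapped to
# a (block, layer-within-block) pair with layers_per_block + 1 slots per block,
# the extra slot accounting for the down/upsampler entry. For example, with the
# usual layers_per_block = 2:
layers_per_block = 2
for i in range(1, 7):
    block_id = (i - 1) // (layers_per_block + 1)
    layer_in_block_id = (i - 1) % (layers_per_block + 1)
    print(i, "->", (block_id, layer_in_block_id))
# i = 1..3 land in down block 0, i = 4..6 in down block 1, and so on.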
-655,12 +619,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -674,31 +634,30 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.weight", "conv.bias"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: attentions = [] if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.bias", "conv.weight"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. 
if len(attentions) == 2: @@ -708,27 +667,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -746,107 +706,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + 
new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -854,58 +781,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) 
conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -913,14 +832,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -955,7 +873,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): clip = {} for key in checkpoint.keys(): if key.startswith("cond_stage_model.transformer"): - clip[key[len("cond_stage_model.transformer."):]] = checkpoint[key] + clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] new_model_state = {} transformers2ppnlp = { @@ -975,9 +893,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.items(): # step1: ignore position_ids if any(i in 
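# conv_attn_to_linear (applied right above) and the "proj_attn.weight" branch of
# assign_to_checkpoint exist because the LDM VAE stores its attention projections
# as 1x1 convolutions, whereas the target AutoencoderKL expects Linear-style
# weights. The conversion amounts to slicing away the trailing kernel dimensions;
# a toy example of that idea (the shape is illustrative, and the exact slice the
# scripts use varies per key):
import numpy as np

conv_proj = np.random.randn(512, 512, 1, 1).astype("float32")  # 1x1 conv kernel
linear_proj = conv_proj[:, :, 0, 0]                            # same values, (out, in) layout
print(linear_proj.shape)  # (512, 512)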
name for i in ignore_value): @@ -990,17 +906,14 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.astype(dtype) new_config = { - "max_text_length": - new_model_state["text_model.positional_embedding.weight"].shape[0], - "vocab_size": - new_model_state["text_model.token_embedding.weight"].shape[0], - "text_embed_dim": - new_model_state["text_model.token_embedding.weight"].shape[1], + "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0], + "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0], + "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1], "text_heads": 12, "text_layers": 12, "text_hidden_act": "quick_gelu", @@ -1019,7 +932,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default=None, @@ -1045,13 +959,15 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() image_size = 512 @@ -1061,14 +977,14 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): if args.original_config_file is None: get_path_from_url( "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/v1-inference.yaml", - root_dir="./", ) + root_dir="./", + ) args.original_config_file = "./v1-inference.yaml" original_config = OmegaConf.load(args.original_config_file) if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = args.num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -1081,7 +997,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1096,44 +1013,37 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): elif args.scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif args.scheduler_type == "ddim": scheduler = scheduler else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type 
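# The CLIP text config in this script is reconstructed from the converted weights
# themselves: the positional-embedding row count becomes max_text_length and the
# token-embedding shape gives vocab_size / text_embed_dim. With toy arrays
# standing in for the real weights (the sizes are just the familiar SD v1 values):
import numpy as np

state = {
    "text_model.positional_embedding.weight": np.zeros((77, 768), dtype="float32"),
    "text_model.token_embedding.weight": np.zeros((49408, 768), dtype="float32"),
}
print({
    "max_text_length": state["text_model.positional_embedding.weight"].shape[0],
    "vocab_size": state["text_model.token_embedding.weight"].shape[0],
    "text_embed_dim": state["text_model.token_embedding.weight"].shape[1],
})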
{args.scheduler_type} doesn't exist!") # 1. Convert the UNet2DConditionModel model. - diffusers_unet_config = create_unet_diffusers_config( - original_config, image_size=image_size) + diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size) diffusers_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = UNet2DConditionModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the VAE model. - vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text_encoder model. - text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - checkpoint, dtype="float32") + text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32") text_model = CLIPTextModel(CLIPTextConfig.from_dict(text_config)) text_model.eval() check_keys(text_model, text_model_state_dict) @@ -1150,5 +1060,6 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py index 96786f7bd3255..55fd755445702 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py @@ -27,10 +27,15 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from transformers import CLIPTextModel as HFCLIPTextModel -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) paddle.set_device("cpu") @@ -60,8 +65,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, 
n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -77,8 +81,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -120,8 +123,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -129,21 +131,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. 
if attention_paths_to_split is not None: @@ -151,13 +152,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -168,8 +167,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -179,8 +177,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -207,25 +204,19 @@ def create_unet_diffusers_config(original_config): """ unet_params = original_config.model.params.unet_config.params - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 @@ -242,7 +233,8 @@ def create_unet_diffusers_config(original_config): block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, - attention_head_dim=attention_head_dim, ) + attention_head_dim=attention_head_dim, + ) return config @@ -266,14 +258,12 @@ def create_vae_diffusers_config(original_config): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -294,8 +284,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -308,17 +297,12 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -327,35 +311,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -364,21 +336,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + 
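# The --extract_ema branch above depends on how the original checkpoints name
# their EMA copies: "model." becomes "model_ema." and the remaining dots are
# stripped. Running the script's own expression on one example key shows the
# flattened form it looks up:
key = "model.diffusion_model.input_blocks.0.0.weight"
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
print(flat_ema_key)  # model_ema.diffusion_modelinput_blocks00weight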
attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -390,7 +358,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -403,19 +372,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -424,14 +392,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -442,12 +409,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -461,17 +424,17 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.weight", "conv.bias"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + 
new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -481,27 +444,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -519,107 +483,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] 
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] 
paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -627,58 +558,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -686,14 +609,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -745,9 +667,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): if f".{layer_need_to_ignore}." 
in name: continue @@ -762,7 +682,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." new_model_state[name] = value.cpu().numpy().astype(dtype) @@ -788,7 +708,8 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default="v2-inference.yaml", @@ -809,13 +730,15 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() @@ -836,26 +759,23 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = UNet2DConditionModel(**diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the VAE model. vae_config = create_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] layer = original_config.model.params.cond_stage_config.params.layer if layer == "last": layer_idx = 0 @@ -867,19 +787,16 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): if text_model_type != "FrozenOpenCLIPEmbedder": print("We only support FrozenOpenCLIPEmbedder as text_encoder!") - clip = HFCLIPTextModel.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K") - ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip( - clip, layer_idx) + clip = HFCLIPTextModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") + ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip(clip, layer_idx) text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(clip_config)) text_encoder.load_dict(ppdiffusers_clip_checkpoint) # 5. load tokenizer. 
pp_tokenizer = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", - pad_token="!", - model_max_length=77) + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", pad_token="!", model_max_length=77 + ) # 6. Convert scheduler. num_train_timesteps = original_config.model.params.timesteps @@ -894,17 +811,14 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif args.scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif args.scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -913,10 +827,10 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") pipe = StableDiffusionPipeline( vae=vae, @@ -926,6 +840,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py index 7caddb24c95d2..b7bed2a4b3b35 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py @@ -19,22 +19,20 @@ import paddle -from ppdiffusers import (FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionMegaPipeline, - StableDiffusionPipeline) +from ppdiffusers import ( + FastDeployStableDiffusionInpaintPipeline, + FastDeployStableDiffusionMegaPipeline, + StableDiffusionPipeline, +) from ppdiffusers.fastdeploy_utils import FastDeployRuntimeModel -def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, - output_path: str, - mode: bool=False): - pipeline = StableDiffusionPipeline.from_pretrained( - model_path, safety_checker=None, feature_extractor=None) +def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, output_path: str, mode: bool = False): + pipeline = StableDiffusionPipeline.from_pretrained(model_path, safety_checker=None, feature_extractor=None) output_path = Path(output_path) # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 or 9 vae_in_channels = pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -42,14 +40,12 @@ def 
convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert text_encoder text_encoder = paddle.jit.to_static( pipeline.text_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None], dtype="int64", name="input_ids") - ], # input_ids + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids ) save_path = os.path.join(args.output_path, "text_encoder", "inference") paddle.jit.save(text_encoder, save_path) @@ -60,17 +56,15 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, unet = paddle.jit.to_static( pipeline.unet, input_spec=[ - paddle.static.InputSpec( - shape=[None, unet_channels, None, None], - dtype="float32", - name="sample"), # sample - paddle.static.InputSpec( - shape=[1], dtype="int64", name="timestep"), # timestep + paddle.static.InputSpec(shape=[None, unet_channels, None, None], dtype="float32", name="sample"), # sample + paddle.static.InputSpec(shape=[1], dtype="int64", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states - ], ) + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) print(f"Save unet model in {save_path} successfully.") @@ -87,8 +81,7 @@ def forward_vae_encoder_sample(self, z): if mode: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) else: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) vae_encoder = paddle.jit.to_static( vae_encoder, @@ -98,7 +91,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -117,8 +111,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, None, None], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. 
save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -131,18 +127,16 @@ def forward_vae_decoder(self, z): fd_pipe_cls = FastDeployStableDiffusionMegaPipeline fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "text_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), tokenizer=pipeline.tokenizer, scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -174,17 +168,13 @@ def forward_vae_decoder(self, z): required=True, help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--mode", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) args = parser.parse_args() - convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - args.model_path, args.output_path, args.mode) + convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(args.model_path, args.output_path, args.mode) diff --git a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py index e8def2f35e60a..6a27ffff944e8 100644 --- a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py +++ b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py @@ -53,9 +53,9 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): all_text_embeds = [] all_image_embeds = [] for text, image_path in tqdm( - zip( - batchify(texts, batch_size), batchify(images_path, batch_size)), - total=math.ceil(len(texts) / batch_size), ): + zip(batchify(texts, batch_size), batchify(images_path, batch_size)), + total=math.ceil(len(texts) / batch_size), + ): assert len(text) == len(image_path) batch_inputs = processor( text=text, @@ -63,56 +63,52 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): return_tensors="pd", max_length=processor.tokenizer.model_max_length, padding="max_length", - truncation=True, ) - text_embeds = model.get_text_features( - input_ids=batch_inputs["input_ids"]) - image_embeds = model.get_image_features( - pixel_values=batch_inputs["pixel_values"]) + truncation=True, + ) + text_embeds = model.get_text_features(input_ids=batch_inputs["input_ids"]) + image_embeds = model.get_image_features(pixel_values=batch_inputs["pixel_values"]) all_text_embeds.append(text_embeds) all_image_embeds.append(image_embeds) all_text_embeds = paddle.concat(all_text_embeds) all_image_embeds = paddle.concat(all_image_embeds) - all_text_embeds = all_text_embeds / all_text_embeds.norm( - axis=-1, keepdim=True) - all_image_embeds = all_image_embeds / 
all_image_embeds.norm( - axis=-1, keepdim=True) - clip_score = (all_image_embeds * - all_text_embeds).sum(-1) * model.logit_scale.exp() + all_text_embeds = all_text_embeds / all_text_embeds.norm(axis=-1, keepdim=True) + all_image_embeds = all_image_embeds / all_image_embeds.norm(axis=-1, keepdim=True) + clip_score = (all_image_embeds * all_text_embeds).sum(-1) * model.logit_scale.exp() return clip_score if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--image_path", default=None, nargs="+", type=str, help="image_path") + parser.add_argument("--image_path", default=None, nargs="+", type=str, help="image_path") parser.add_argument( "--output_file", default="statistic_results.json", type=str, - help="output file name", ) + help="output file name", + ) parser.add_argument( "--text_file_name", default="coco30k", choices=["coco1k", "coco10k", "coco30k"], type=str, - help="text file.", ) + help="text file.", + ) parser.add_argument( "--clip_model_name_or_path", default="openai/clip-vit-base-patch32", type=str, - help="clip_model_name_or_path", ) - parser.add_argument( - "--fid_batch_size", default=32, type=int, help="fid_batch_size") - parser.add_argument( - "--clip_batch_size", default=64, type=int, help="clip_batch_size") - parser.add_argument( - "--resolution", default=256, type=int, help="resolution of images") + help="clip_model_name_or_path", + ) + parser.add_argument("--fid_batch_size", default=32, type=int, help="fid_batch_size") + parser.add_argument("--clip_batch_size", default=64, type=int, help="clip_batch_size") + parser.add_argument("--resolution", default=256, type=int, help="resolution of images") parser.add_argument("--device", default="gpu", type=str, help="device") parser.add_argument( "--only_fid", action="store_true", - help=("Only eval fid. "), ) + help=("Only eval fid. 
"), + ) args = parser.parse_args() paddle.set_device(args.device) @@ -127,11 +123,9 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): else: os.environ["FLAG_IMAGE_NUM"] = "1000" dataset_name = f"coco_{args.resolution}_{image_num}.npz" - fid_target_file = get_path_from_url(base_url + dataset_name, - cache_path) + ".npz" + fid_target_file = get_path_from_url(base_url + dataset_name, cache_path) + ".npz" - text_file = get_path_from_url(base_url + text_file_name + ".tsv", - cache_path) + text_file = get_path_from_url(base_url + text_file_name + ".tsv", cache_path) df = pd.read_csv(text_file, sep="\t") texts = df["caption_en"].tolist() if not args.only_fid: @@ -149,18 +143,16 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): [fid_target_file, path], batch_size=args.fid_batch_size, dims=2048, - num_workers=4, ) + num_workers=4, + ) results["fid"].append(fid_value) if not args.only_fid: # clip score - images_path = sorted([ - image_path - for ext in IMAGE_EXTENSIONS - for image_path in pathlib.Path(path).glob("*.{}".format(ext)) - ]) - clip_score = compute_clip_score(model, processor, texts, - images_path, args.clip_batch_size) + images_path = sorted( + [image_path for ext in IMAGE_EXTENSIONS for image_path in pathlib.Path(path).glob("*.{}".format(ext))] + ) + clip_score = compute_clip_score(model, processor, texts, images_path, args.clip_batch_size) if "clip_score" not in results: results["clip_score"] = [] _clip_score = clip_score.mean().item() diff --git a/ppdiffusers/scripts/fid_clip_score/fid_score.py b/ppdiffusers/scripts/fid_clip_score/fid_score.py index c73e4597015ad..9c6a81cb351c9 100755 --- a/ppdiffusers/scripts/fid_clip_score/fid_score.py +++ b/ppdiffusers/scripts/fid_clip_score/fid_score.py @@ -67,42 +67,37 @@ def tqdm(x): from inception import InceptionV3 parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--batch-size", type=int, default=50, help="Batch size to use") -parser.add_argument( - "--resolution", type=int, default=None, help="The resolution to resize.") +parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use") +parser.add_argument("--resolution", type=int, default=None, help="The resolution to resize.") parser.add_argument( "--num-workers", type=int, - help=("Number of processes to use for data loading. " - "Defaults to `min(8, num_cpus)`"), ) -parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use. Like cuda, cuda:0 or cpu") + help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"), +) +parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu") parser.add_argument( "--dims", type=int, default=2048, choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), - help=("Dimensionality of Inception features to use. " - "By default, uses pool3 features"), ) + help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"), +) parser.add_argument( "--save-stats", action="store_true", - help=("Generate an npz archive from a directory of samples. " - "The first path is used as input and the second as output."), ) + help=( + "Generate an npz archive from a directory of samples. " + "The first path is used as input and the second as output." 
+ ), +) parser.add_argument( "path", type=str, nargs=2, - help=("Paths to the generated images or " - "to .npz statistic files"), ) + help=("Paths to the generated images or " "to .npz statistic files"), +) -IMAGE_EXTENSIONS = { - "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp" -} +IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} class ImagePathDataset(paddle.io.Dataset): @@ -125,12 +120,7 @@ def __getitem__(self, i): return {"img": img} -def get_activations(files, - model, - batch_size=50, - dims=2048, - num_workers=1, - resolution=None): +def get_activations(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None): """Calculates the activations of the pool_3 layer for all images. Params: @@ -152,18 +142,17 @@ def get_activations(files, model.eval() if batch_size > len(files): - print(("Warning: batch size is bigger than the data size. " - "Setting batch size to data size")) + print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size")) batch_size = len(files) - dataset = ImagePathDataset( - files, transforms=TF.ToTensor(), resolution=resolution) + dataset = ImagePathDataset(files, transforms=TF.ToTensor(), resolution=resolution) dataloader = paddle.io.DataLoader( dataset, batch_size=batch_size, shuffle=False, drop_last=False, - num_workers=num_workers, ) + num_workers=num_workers, + ) pred_arr = np.empty((len(files), dims)) @@ -181,7 +170,7 @@ def get_activations(files, pred = pred.squeeze(3).squeeze(2).cpu().numpy() - pred_arr[start_idx:start_idx + pred.shape[0]] = pred + pred_arr[start_idx : start_idx + pred.shape[0]] = pred start_idx = start_idx + pred.shape[0] @@ -216,18 +205,15 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): sigma1 = np.atleast_2d(sigma1) sigma2 = np.atleast_2d(sigma2) - assert (mu1.shape == mu2.shape - ), "Training and test mean vectors have different lengths" - assert (sigma1.shape == sigma2.shape - ), "Training and test covariances have different dimensions" + assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" + assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" diff = mu1 - mu2 # Product might be almost singular covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) if not np.isfinite(covmean).all(): - msg = ("fid calculation produces singular product; " - "adding %s to diagonal of cov estimates") % eps + msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps print(msg) offset = np.eye(sigma1.shape[0]) * eps covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) @@ -244,12 +230,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean -def calculate_activation_statistics(files, - model, - batch_size=50, - dims=2048, - num_workers=1, - resolution=None): +def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None): """Calculation of the statistics used by the FID. Params: -- files : List of image files paths @@ -266,43 +247,28 @@ def calculate_activation_statistics(files, -- sigma : The covariance matrix of the activations of the pool_3 layer of the inception model. 
""" - act = get_activations( - files, model, batch_size, dims, num_workers, resolution=resolution) + act = get_activations(files, model, batch_size, dims, num_workers, resolution=resolution) mu = np.mean(act, axis=0) sigma = np.cov(act, rowvar=False) return mu, sigma -def compute_statistics_of_path(path, - model, - batch_size, - dims, - num_workers=1, - resolution=None): +def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1, resolution=None): if path.endswith(".npz"): with np.load(path) as f: m, s = f["mu"][:], f["sigma"][:] else: path = pathlib.Path(path) - files = sorted([ - file - for ext in IMAGE_EXTENSIONS - for file in path.glob("*.{}".format(ext)) - ]) + files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))]) FLAG_IMAGE_NUM = os.getenv("FLAG_IMAGE_NUM", None) if FLAG_IMAGE_NUM is not None: - files = files[:int(FLAG_IMAGE_NUM)] - m, s = calculate_activation_statistics( - files, model, batch_size, dims, num_workers, resolution=resolution) + files = files[: int(FLAG_IMAGE_NUM)] + m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers, resolution=resolution) return m, s -def calculate_fid_given_paths(paths, - batch_size, - dims, - num_workers=1, - resolution=None): +def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1, resolution=None): """Calculates the FID of two paths""" for p in paths: if not os.path.exists(p): @@ -312,11 +278,9 @@ def calculate_fid_given_paths(paths, model = InceptionV3([block_idx]) - m1, s1 = compute_statistics_of_path( - paths[0], model, batch_size, dims, num_workers, resolution=resolution) + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution) - m2, s2 = compute_statistics_of_path( - paths[1], model, batch_size, dims, num_workers, resolution=resolution) + m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers, resolution=resolution) fid_value = calculate_frechet_distance(m1, s1, m2, s2) @@ -337,8 +301,7 @@ def save_fid_stats(paths, batch_size, dims, num_workers=1, resolution=None): print(f"Saving statistics for {paths[0]}") - m1, s1 = compute_statistics_of_path( - paths[0], model, batch_size, dims, num_workers, resolution=resolution) + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution) np.savez_compressed(paths[1], mu=m1, sigma=s1) @@ -367,15 +330,13 @@ def main(): args.batch_size, args.dims, num_workers, - resolution=args.resolution, ) + resolution=args.resolution, + ) return fid_value = calculate_fid_given_paths( - args.path, - args.batch_size, - args.dims, - num_workers, - resolution=args.resolution) + args.path, args.batch_size, args.dims, num_workers, resolution=args.resolution + ) print("FID: ", fid_value) diff --git a/ppdiffusers/scripts/fid_clip_score/inception.py b/ppdiffusers/scripts/fid_clip_score/inception.py index 9aecdf265779a..bbdff9a933432 100644 --- a/ppdiffusers/scripts/fid_clip_score/inception.py +++ b/ppdiffusers/scripts/fid_clip_score/inception.py @@ -21,7 +21,8 @@ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz FID_WEIGHTS_URL = ( "https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams", - "8e2ae24c34c5c8b81d45167bb9361f4c", ) + "8e2ae24c34c5c8b81d45167bb9361f4c", +) WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams" @@ -47,17 +48,18 @@ class ConvNormActivation(nn.Sequential): """ def __init__( - self, - in_channels, - 
out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=nn.BatchNorm2D, - activation_layer=nn.ReLU, - dilation=1, - bias=None, ): + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=nn.BatchNorm2D, + activation_layer=nn.ReLU, + dilation=1, + bias=None, + ): if padding is None: padding = (kernel_size - 1) // 2 * dilation if bias is None: @@ -71,7 +73,8 @@ def __init__( padding, dilation=dilation, groups=groups, - bias_attr=bias, ) + bias_attr=bias, + ) ] if norm_layer is not None: # The hyperparameter of BatchNorm2D is different from PaddlePaddle. @@ -97,12 +100,13 @@ class InceptionV3(nn.Layer): } def __init__( - self, - output_blocks=(DEFAULT_BLOCK_INDEX, ), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True, ): + self, + output_blocks=(DEFAULT_BLOCK_INDEX,), + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True, + ): """Build pretrained InceptionV3 Parameters @@ -211,8 +215,7 @@ def forward(self, inp): outp = [] x = inp if self.resize_input: - x = F.interpolate( - x, size=(299, 299), mode="bilinear", align_corners=False) + x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False) if self.normalize_input: x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) @@ -235,8 +238,7 @@ def hack_bn_layer(layer): def _inception_v3(*args, **kwargs): """Wraps `paddle.vision.models.inception_v3`""" - return paddle.vision.models.inception_v3(*args, - **kwargs).apply(hack_bn_layer) + return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer) def fid_inception_v3(): @@ -248,8 +250,7 @@ def fid_inception_v3(): This method first constructs paddle.vision's Inception and then patches the necessary parts that are different in the FID Inception model. 
""" - inception = _inception_v3( - num_classes=1008, with_pool=True, pretrained=False) + inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False) inception.inception_block_list[0] = InceptionA(192, pool_features=32) inception.inception_block_list[1] = InceptionA(256, pool_features=64) inception.inception_block_list[2] = InceptionA(288, pool_features=64) @@ -260,8 +261,7 @@ def fid_inception_v3(): inception.inception_block_list[9] = InceptionE_1(1280) inception.inception_block_list[10] = InceptionE_2(2048) - weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], - FID_WEIGHTS_URL[1]) + weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1]) state_dict = paddle.load(weight_path) inception.set_state_dict(state_dict) return inception @@ -275,49 +275,55 @@ def __init__(self, num_channels, pool_features): out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_1 = ConvNormActivation( in_channels=num_channels, out_channels=48, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_2 = ConvNormActivation( in_channels=48, out_channels=64, kernel_size=5, padding=2, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=64, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3 = ConvNormActivation( in_channels=96, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=pool_features, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -330,8 +336,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) return x @@ -343,7 +348,8 @@ def __init__(self, num_channels, channels_7x7): out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_1 = ConvNormActivation( in_channels=num_channels, @@ -351,62 +357,70 @@ def __init__(self, num_channels, channels_7x7): kernel_size=1, stride=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), stride=1, padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(7, 1), stride=1, padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=channels_7x7, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + 
activation_layer=nn.ReLU, + ) self.branch7x7dbl_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_4 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_5 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -424,8 +438,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) return x @@ -438,61 +451,69 @@ def __init__(self, num_channels): out_channels=320, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_1 = ConvNormActivation( in_channels=num_channels, out_channels=384, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=448, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=448, out_channels=384, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -515,8 +536,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = 
self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x @@ -549,6 +569,5 @@ def forward(self, x): branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x diff --git a/ppdiffusers/setup.py b/ppdiffusers/setup.py index bb412f60fc4f4..a5d0f3cf3b5e9 100644 --- a/ppdiffusers/setup.py +++ b/ppdiffusers/setup.py @@ -57,10 +57,7 @@ def read_requirements(): keywords=["ppdiffusers", "paddle", "paddlemix"], install_requires=REQUIRED_PACKAGES, python_requires=">=3.6", - entry_points={ - "console_scripts": - ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"] - }, + entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]}, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", @@ -70,4 +67,5 @@ def read_requirements(): "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], - license="Apache 2.0", ) + license="Apache 2.0", +) diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py index 7f987b99141b8..aa10a342c68d4 100644 --- a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py +++ b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py @@ -38,13 +38,14 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[paddle.Generator]=None, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[paddle.Generator] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -74,8 +75,10 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -95,6 +98,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ), "This is a local test" + return (image,), "This is a local test" return ImagePipelineOutput(images=image), "This is a local test" diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py index d562cd9e580cc..ebdc7650dafd2 100644 --- a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py +++ b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py @@ -38,13 +38,14 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[paddle.Generator]=None, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[paddle.Generator] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> 
Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -74,8 +75,10 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -95,6 +98,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ), "This is a local test" + return (image,), "This is a local test" return ImagePipelineOutput(images=image), "This is a local test" diff --git a/ppdiffusers/tests/models/test_attention_processor.py b/ppdiffusers/tests/models/test_attention_processor.py index 84b2d1e9263cb..f47ddfa4abb1d 100644 --- a/ppdiffusers/tests/models/test_attention_processor.py +++ b/ppdiffusers/tests/models/test_attention_processor.py @@ -16,12 +16,11 @@ import paddle -from ppdiffusers.models.attention_processor import (Attention, - AttnAddedKVProcessor) +from ppdiffusers.models.attention_processor import Attention, AttnAddedKVProcessor class AttnAddedKVProcessorTests(unittest.TestCase): - def get_constructor_arguments(self, only_cross_attention: bool=False): + def get_constructor_arguments(self, only_cross_attention: bool = False): query_dim = 10 if only_cross_attention: @@ -59,8 +58,7 @@ def test_only_cross_attention(self): paddle.seed(0) - constructor_args = self.get_constructor_arguments( - only_cross_attention=False) + constructor_args = self.get_constructor_arguments(only_cross_attention=False) attn = Attention(**constructor_args) self.assertTrue(attn.to_k is not None) @@ -68,7 +66,8 @@ def test_only_cross_attention(self): forward_args = self.get_forward_arguments( query_dim=constructor_args["query_dim"], - added_kv_proj_dim=constructor_args["added_kv_proj_dim"], ) + added_kv_proj_dim=constructor_args["added_kv_proj_dim"], + ) self_and_cross_attn_out = attn(**forward_args) @@ -76,8 +75,7 @@ def test_only_cross_attention(self): paddle.seed(0) - constructor_args = self.get_constructor_arguments( - only_cross_attention=True) + constructor_args = self.get_constructor_arguments(only_cross_attention=True) attn = Attention(**constructor_args) self.assertTrue(attn.to_k is None) @@ -85,7 +83,8 @@ def test_only_cross_attention(self): forward_args = self.get_forward_arguments( query_dim=constructor_args["query_dim"], - added_kv_proj_dim=constructor_args["added_kv_proj_dim"], ) + added_kv_proj_dim=constructor_args["added_kv_proj_dim"], + ) only_cross_attn_out = attn(**forward_args) diff --git a/ppdiffusers/tests/models/test_layers_utils.py b/ppdiffusers/tests/models/test_layers_utils.py index 6bfcd5b37fbab..32480c6e215df 100644 --- a/ppdiffusers/tests/models/test_layers_utils.py +++ b/ppdiffusers/tests/models/test_layers_utils.py @@ -19,8 +19,12 @@ import paddle import paddle.nn -from ppdiffusers.models.attention import (GEGLU, AdaLayerNorm, ApproximateGELU, - AttentionBlock) +from ppdiffusers.models.attention import ( + GEGLU, + AdaLayerNorm, + ApproximateGELU, + AttentionBlock, +) from ppdiffusers.models.embeddings import get_timestep_embedding from ppdiffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D from ppdiffusers.models.transformer_2d import Transformer2DModel @@ -31,8 +35,8 @@ def test_timestep_embeddings(self): embedding_dim = 256 timesteps = paddle.arange(start=16) t1 = get_timestep_embedding(timesteps, embedding_dim) - assert (t1[0, :embedding_dim // 2] - 0).abs().sum() < 1e-05 - assert (t1[0, embedding_dim 
// 2:] - 1).abs().sum() < 1e-05 + assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-05 + assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-05 assert (t1[:, -1] - 1).abs().sum() < 1e-05 grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1) prev_grad = 0.0 @@ -49,72 +53,59 @@ def test_timestep_defaults(self): embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, - max_period=10000, ) + max_period=10000, + ) assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01) def test_timestep_flip_sin_cos(self): embedding_dim = 16 timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=True) - t1 = paddle.concat( - x=[t1[:, embedding_dim // 2:], t1[:, :embedding_dim // 2]], axis=-1) - t2 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=False) + t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True) + t1 = paddle.concat(x=[t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], axis=-1) + t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False) assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01) def test_timestep_downscale_freq_shift(self): embedding_dim = 16 timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding( - timesteps, embedding_dim, downscale_freq_shift=0) - t2 = get_timestep_embedding( - timesteps, embedding_dim, downscale_freq_shift=1) - cosine_half = (t1 - t2)[:, embedding_dim // 2:] + t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0) + t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1) + cosine_half = (t1 - t2)[:, embedding_dim // 2 :] assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-05 def test_sinoid_embeddings_hardcoded(self): embedding_dim = 64 timesteps = paddle.arange(start=128) - t1 = get_timestep_embedding( - timesteps, - embedding_dim, - downscale_freq_shift=1, - flip_sin_to_cos=False) - t2 = get_timestep_embedding( - timesteps, - embedding_dim, - downscale_freq_shift=0, - flip_sin_to_cos=True) + t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False) + t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True) t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000) assert paddle.allclose( t1[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([ - 0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, - 0.9872 - ]), - atol=0.01, ) + paddle.to_tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]), + atol=0.01, + ) assert paddle.allclose( t2[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([ - 0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474, - 0.1864 - ]), - atol=0.01, ) + paddle.to_tensor([0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474, 0.1864]), + atol=0.01, + ) assert paddle.allclose( t3[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([ - -0.9801, - -0.9464, - -0.9349, - -0.3952, - 0.8887, - -0.9709, - 0.5299, - -0.2853, - -0.9927, - ]), - atol=0.01, ) + paddle.to_tensor( + [ + -0.9801, + -0.9464, + -0.9349, + -0.3952, + 0.8887, + -0.9709, + 0.5299, + -0.2853, + -0.9927, + ] + ), + atol=0.01, + ) class Upsample2DBlockTests(unittest.TestCase): @@ -126,19 +117,20 @@ def test_upsample_default(self): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 32, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -1.50215650, - -0.12905766, - -0.12905766, - 
-1.97015178, - 0.78776687, - 0.78776687, - -1.97015178, - 0.78776687, - 0.78776687, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -1.50215650, + -0.12905766, + -0.12905766, + -1.97015178, + 0.78776687, + 0.78776687, + -1.97015178, + 0.78776687, + 0.78776687, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_upsample_with_conv(self): paddle.seed(0) @@ -148,19 +140,20 @@ def test_upsample_with_conv(self): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 32, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.4583871364593506, - -0.8221798539161682, - -0.8228907585144043, - 0.3325321078300476, - -0.24422502517700195, - 1.344732642173767, - 0.5239212512969971, - -0.4814918637275696, - 0.17928099632263184, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.4583871364593506, + -0.8221798539161682, + -0.8228907585144043, + 0.3325321078300476, + -0.24422502517700195, + 1.344732642173767, + 0.5239212512969971, + -0.4814918637275696, + 0.17928099632263184, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_upsample_with_conv_out_dim(self): paddle.seed(0) @@ -170,42 +163,43 @@ def test_upsample_with_conv_out_dim(self): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 64, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.9049283266067505, - -1.6125869750976562, - -1.0837469100952148, - 0.24520659446716309, - -0.6669139266014099, - 0.5660533905029297, - 1.1056761741638184, - 2.1717309951782227, - 0.7197026610374451, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.9049283266067505, + -1.6125869750976562, + -1.0837469100952148, + 0.24520659446716309, + -0.6669139266014099, + 0.5660533905029297, + 1.1056761741638184, + 2.1717309951782227, + 0.7197026610374451, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_upsample_with_transpose(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 32, 32]) - upsample = Upsample2D( - channels=32, use_conv=False, use_conv_transpose=True) + upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True) with paddle.no_grad(): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 32, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.05951342731714249, - 0.26951998472213745, - 0.2600363492965698, - 1.12237548828125, - -0.07744798064231873, - 0.006375734228640795, - 0.6678807735443115, - 0.44324278831481934, - -0.10978640615940094, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.05951342731714249, + 0.26951998472213745, + 0.2600363492965698, + 1.12237548828125, + -0.07744798064231873, + 0.006375734228640795, + 0.6678807735443115, + 0.44324278831481934, + -0.10978640615940094, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class Downsample2DBlockTests(unittest.TestCase): @@ -217,17 +211,19 @@ def test_downsample_default(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 32, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.24012964963912964, 
- -0.034197285771369934, - -1.0328047275543213, - 0.7861506938934326, - -0.2086063176393509, - -0.3999312222003937, - 0.25081655383110046, - -0.23891538381576538, - -1.4398303031921387, - ]) + expected_slice = paddle.to_tensor( + [ + -0.24012964963912964, + -0.034197285771369934, + -1.0328047275543213, + 0.7861506938934326, + -0.2086063176393509, + -0.3999312222003937, + 0.25081655383110046, + -0.23891538381576538, + -1.4398303031921387, + ] + ) max_diff = (output_slice.flatten() - expected_slice).abs().sum().item() assert max_diff <= 0.001 @@ -239,19 +235,20 @@ def test_downsample_with_conv(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 32, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.009430217556655407, - 0.8657761216163635, - 1.7985490560531616, - -0.61894291639328, - -2.5752196311950684, - 1.2352519035339355, - 0.6046919822692871, - -1.6499173641204834, - -1.5272349119186401, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.009430217556655407, + 0.8657761216163635, + 1.7985490560531616, + -0.61894291639328, + -2.5752196311950684, + 1.2352519035339355, + 0.6046919822692871, + -1.6499173641204834, + -1.5272349119186401, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_downsample_with_conv_pad1(self): paddle.seed(0) @@ -261,19 +258,20 @@ def test_downsample_with_conv_pad1(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 32, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.009430217556655407, - 0.8657761216163635, - 1.7985490560531616, - -0.61894291639328, - -2.5752196311950684, - 1.2352519035339355, - 0.6046919822692871, - -1.6499173641204834, - -1.5272349119186401, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.009430217556655407, + 0.8657761216163635, + 1.7985490560531616, + -0.61894291639328, + -2.5752196311950684, + 1.2352519035339355, + 0.6046919822692871, + -1.6499173641204834, + -1.5272349119186401, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_downsample_with_conv_out_dim(self): paddle.seed(0) @@ -283,19 +281,20 @@ def test_downsample_with_conv_out_dim(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 16, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.10819266736507416, - 0.43043053150177, - -0.7322822213172913, - -1.923148512840271, - 1.0195047855377197, - 0.48796477913856506, - 1.6765365600585938, - -4.072991847991943, - 0.8763526082038879, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.10819266736507416, + 0.43043053150177, + -0.7322822213172913, + -1.923148512840271, + 1.0195047855377197, + 0.48796477913856506, + 1.6765365600585938, + -4.072991847991943, + 0.8763526082038879, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class ResnetBlock2DTests(unittest.TestCase): @@ -308,43 +307,44 @@ def test_resnet_default(self): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 64, 64) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.9816107749938965, - 1.4443503618240356, - -1.0354782342910767, - 
0.23985600471496582, - -1.0868161916732788, - -1.5830397605895996, - -0.041037797927856445, - -1.2574901580810547, - -0.5504958629608154, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.9816107749938965, + 1.4443503618240356, + -1.0354782342910767, + 0.23985600471496582, + -1.0868161916732788, + -1.5830397605895996, + -0.041037797927856445, + -1.2574901580810547, + -0.5504958629608154, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_restnet_with_use_in_shortcut(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, use_in_shortcut=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 64, 64) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.9861348867416382, - -1.097771406173706, - 0.268703430891037, - 0.40997087955474854, - -4.26219367980957, - 1.758486270904541, - -0.8979732990264893, - 0.30774950981140137, - 3.2780206203460693, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.9861348867416382, + -1.097771406173706, + 0.268703430891037, + 0.40997087955474854, + -4.26219367980957, + 1.758486270904541, + -0.8979732990264893, + 0.30774950981140137, + 3.2780206203460693, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_resnet_up(self): paddle.seed(0) @@ -355,91 +355,92 @@ def test_resnet_up(self): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 128, 128) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.2874237298965454, - -2.6432056427001953, - -2.1900298595428467, - -0.48899877071380615, - -1.1637755632400513, - -1.084446907043457, - -1.1333439350128174, - 0.2726985812187195, - -0.014697253704071045, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.2874237298965454, + -2.6432056427001953, + -2.1900298595428467, + -0.48899877071380615, + -1.1637755632400513, + -1.084446907043457, + -1.1333439350128174, + 0.2726985812187195, + -0.014697253704071045, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_resnet_down(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, down=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 32, 32) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.54087495803833, - 0.26700693368911743, - -0.540952742099762, - 2.7190208435058594, - -0.09766747057437897, - 0.23407122492790222, - 0.47980907559394836, - 0.6348602771759033, - -0.75424242019653322, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.54087495803833, + 0.26700693368911743, + -0.540952742099762, + 2.7190208435058594, + -0.09766747057437897, + 0.23407122492790222, + 0.47980907559394836, + 
0.6348602771759033, + -0.75424242019653322, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_restnet_with_kernel_fir(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, kernel="fir", down=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 32, 32) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.9914248585700989, - 0.4773162007331848, - -0.021942138671875, - 2.482321262359619, - 0.18839354813098907, - 0.1516135334968567, - 0.7221578359603882, - 0.3920581340789795, - -0.24661940336227417, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.9914248585700989, + 0.4773162007331848, + -0.021942138671875, + 2.482321262359619, + 0.18839354813098907, + 0.1516135334968567, + 0.7221578359603882, + 0.3920581340789795, + -0.24661940336227417, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_restnet_with_kernel_sde_vp(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, kernel="sde_vp", down=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 32, 32) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.54087495803833, - 0.26700693368911743, - -0.540952742099762, - 2.7190208435058594, - -0.09766747057437897, - 0.23407122492790222, - 0.47980907559394836, - 0.6348602771759033, - -0.7542424201965332, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.54087495803833, + 0.26700693368911743, + -0.540952742099762, + 2.7190208435058594, + -0.09766747057437897, + 0.23407122492790222, + 0.47980907559394836, + 0.6348602771759033, + -0.7542424201965332, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class AttentionBlockTests(unittest.TestCase): @@ -451,50 +452,49 @@ def test_attention_block_default(self): num_head_channels=1, rescale_output_factor=1.0, eps=1e-06, - norm_num_groups=32, ) + norm_num_groups=32, + ) with paddle.no_grad(): attention_scores = attentionBlock(sample) assert attention_scores.shape == [1, 32, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.638939619064331, - -0.15776772797107697, - -1.1130025386810303, - -0.8540273904800415, - -0.5696781873703003, - -2.0493741035461426, - -0.3732607960700989, - -1.740313172340393, - -0.5271167755126953, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.638939619064331, + -0.15776772797107697, + -1.1130025386810303, + -0.8540273904800415, + -0.5696781873703003, + -2.0493741035461426, + -0.3732607960700989, + -1.740313172340393, + -0.5271167755126953, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_attention_block_sd(self): paddle.seed(0) sample = paddle.randn(shape=[1, 512, 64, 64]) - 
attentionBlock = AttentionBlock( - channels=512, - rescale_output_factor=1.0, - eps=1e-06, - norm_num_groups=32) + attentionBlock = AttentionBlock(channels=512, rescale_output_factor=1.0, eps=1e-06, norm_num_groups=32) with paddle.no_grad(): attention_scores = attentionBlock(sample) assert attention_scores.shape == [1, 512, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.8007570505142212, - -0.770350992679596, - -3.5278191566467285, - -2.0540268421173096, - -0.7711739540100098, - -0.8278288245201111, - -0.48292720317840576, - 1.6039936542510986, - 0.626724362373352, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.8007570505142212, + -0.770350992679596, + -3.5278191566467285, + -2.0540268421173096, + -0.7711739540100098, + -0.8278288245201111, + -0.48292720317840576, + 1.6039936542510986, + 0.626724362373352, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class Transformer2DModelTests(unittest.TestCase): @@ -506,24 +506,26 @@ def test_spatial_transformer_default(self): num_attention_heads=1, attention_head_dim=32, dropout=0.0, - cross_attention_dim=None, ) + cross_attention_dim=None, + ) with paddle.no_grad(): attention_scores = spatial_transformer_block(sample).sample assert attention_scores.shape == [1, 32, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 2.6310853958129883, - 5.990478515625, - 0.5715246200561523, - -2.5269505977630615, - -2.853764057159424, - -5.163403511047363, - 0.2880846858024597, - -5.925153732299805, - 2.316770076751709, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 2.6310853958129883, + 5.990478515625, + 0.5715246200561523, + -2.5269505977630615, + -2.853764057159424, + -5.163403511047363, + 0.2880846858024597, + -5.925153732299805, + 2.316770076751709, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_cross_attention_dim(self): paddle.seed(0) @@ -533,25 +535,27 @@ def test_spatial_transformer_cross_attention_dim(self): num_attention_heads=2, attention_head_dim=32, dropout=0.0, - cross_attention_dim=64, ) + cross_attention_dim=64, + ) with paddle.no_grad(): context = paddle.randn(shape=[1, 4, 64]) attention_scores = spatial_transformer_block(sample, context).sample assert attention_scores.shape == [1, 64, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.08756911754608154, - -3.94197940826416, - -0.25678586959838867, - 2.1481714248657227, - 2.327033042907715, - 0.29948690533638, - 1.3845969438552856, - 0.7825677394866943, - 1.4856826066970825, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.08756911754608154, + -3.94197940826416, + -0.25678586959838867, + 2.1481714248657227, + 2.327033042907715, + 0.29948690533638, + 1.3845969438552856, + 0.7825677394866943, + 1.4856826066970825, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_timestep(self): paddle.seed(0) @@ -563,44 +567,45 @@ def test_spatial_transformer_timestep(self): attention_head_dim=32, dropout=0.0, cross_attention_dim=64, - num_embeds_ada_norm=num_embeds_ada_norm, ) + num_embeds_ada_norm=num_embeds_ada_norm, + ) with paddle.no_grad(): timestep_1 = 
paddle.to_tensor(1, dtype="int64") timestep_2 = paddle.to_tensor(2, dtype="int64") - attention_scores_1 = spatial_transformer_block( - sample, timestep=timestep_1).sample - attention_scores_2 = spatial_transformer_block( - sample, timestep=timestep_2).sample + attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample + attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample assert tuple(attention_scores_1.shape) == (1, 64, 64, 64) assert tuple(attention_scores_2.shape) == (1, 64, 64, 64) output_slice_1 = attention_scores_1[0, -1, -3:, -3:] output_slice_2 = attention_scores_2[0, -1, -3:, -3:] - expected_slice_1 = paddle.to_tensor([ - -0.15322405099868774, - -1.265586018562317, - -5.424124717712402, - -0.7333418130874634, - -0.5904415249824524, - 0.9293081760406494, - 1.1033945083618164, - -5.200987815856934, - -0.7598087787628174, - ]) - expected_slice_2 = paddle.to_tensor([ - 0.12572699785232544, - -1.0498149394989014, - -5.207070350646973, - -0.41757693886756897, - -0.25374162197113037, - 1.152648687362671, - 1.422953724861145, - -4.933906078338623, - -0.564710259437561, - ]) - assert paddle.allclose( - output_slice_1.flatten(), expected_slice_1, atol=0.01) - assert paddle.allclose( - output_slice_2.flatten(), expected_slice_2, atol=0.01) + expected_slice_1 = paddle.to_tensor( + [ + -0.15322405099868774, + -1.265586018562317, + -5.424124717712402, + -0.7333418130874634, + -0.5904415249824524, + 0.9293081760406494, + 1.1033945083618164, + -5.200987815856934, + -0.7598087787628174, + ] + ) + expected_slice_2 = paddle.to_tensor( + [ + 0.12572699785232544, + -1.0498149394989014, + -5.207070350646973, + -0.41757693886756897, + -0.25374162197113037, + 1.152648687362671, + 1.422953724861145, + -4.933906078338623, + -0.564710259437561, + ] + ) + assert paddle.allclose(output_slice_1.flatten(), expected_slice_1, atol=0.01) + assert paddle.allclose(output_slice_2.flatten(), expected_slice_2, atol=0.01) def test_spatial_transformer_dropout(self): paddle.seed(0) @@ -610,24 +615,26 @@ def test_spatial_transformer_dropout(self): num_attention_heads=2, attention_head_dim=16, dropout=0.3, - cross_attention_dim=None, ).eval() + cross_attention_dim=None, + ).eval() with paddle.no_grad(): attention_scores = spatial_transformer_block(sample).sample assert attention_scores.shape == [1, 32, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 2.535370349884033, - 6.2350993156433105, - 0.8244613409042358, - -2.6684911251068115, - -2.758057117462158, - -5.176937103271484, - 0.3372979760169983, - -5.837750434875488, - 2.3483340740203857, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 2.535370349884033, + 6.2350993156433105, + 0.8244613409042358, + -2.6684911251068115, + -2.758057117462158, + -5.176937103271484, + 0.3372979760169983, + -5.837750434875488, + 2.3483340740203857, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_discrete(self): paddle.seed(0) @@ -637,99 +644,75 @@ def test_spatial_transformer_discrete(self): num_attention_heads=1, attention_head_dim=32, num_vector_embeds=num_embed, - sample_size=16, ).eval() + sample_size=16, + ).eval() with paddle.no_grad(): attention_scores = spatial_transformer_block(sample).sample assert attention_scores.shape == [1, num_embed - 1, 32] output_slice = attention_scores[0, -2:, -3:] - expected_slice = paddle.to_tensor([ - 
-0.14130862057209015, - -0.14278407394886017, - -0.498604953289032, - -3.2408740520477295, - -3.852043390274048, - -2.099970579147339, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.14130862057209015, + -0.14278407394886017, + -0.498604953289032, + -3.2408740520477295, + -3.852043390274048, + -2.099970579147339, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_default_norm_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32) - assert (spatial_transformer_block.transformer_blocks[0].norm1.__class__ - == paddle.nn.LayerNorm) - assert (spatial_transformer_block.transformer_blocks[0].norm3.__class__ - == paddle.nn.LayerNorm) + spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) + assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == paddle.nn.LayerNorm + assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm def test_spatial_transformer_ada_norm_layers(self): spatial_transformer_block = Transformer2DModel( num_attention_heads=1, attention_head_dim=32, in_channels=32, - num_embeds_ada_norm=5, ) - assert (spatial_transformer_block.transformer_blocks[0].norm1.__class__ - == AdaLayerNorm) - assert (spatial_transformer_block.transformer_blocks[0].norm3.__class__ - == paddle.nn.LayerNorm) + num_embeds_ada_norm=5, + ) + assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm + assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm def test_spatial_transformer_default_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ - == GEGLU) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ - == paddle.nn.Dropout) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ - == paddle.nn.Linear) + spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU + assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear dim = 32 inner_dim = 128 - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[0] == dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[1] == inner_dim * 2) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[0] == inner_dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[1] == dim) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim * 2 + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim def test_spatial_transformer_geglu_approx_ff_layers(self): spatial_transformer_block = Transformer2DModel( 
num_attention_heads=1, attention_head_dim=32, in_channels=32, - activation_fn="geglu-approximate", ) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ - == ApproximateGELU) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ - == paddle.nn.Dropout) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ - == paddle.nn.Linear) + activation_fn="geglu-approximate", + ) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU + assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear dim = 32 inner_dim = 128 - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[0] == dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[1] == inner_dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[0] == inner_dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[1] == dim) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim def test_spatial_transformer_attention_bias(self): spatial_transformer_block = Transformer2DModel( num_attention_heads=1, attention_head_dim=32, in_channels=32, - attention_bias=True, ) - assert (spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias - is not None) - assert (spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias - is not None) - assert (spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias - is not None) + attention_bias=True, + ) + assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None + assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None + assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None diff --git a/ppdiffusers/tests/models/test_lora_layers.py b/ppdiffusers/tests/models/test_lora_layers.py index 14c192e1e5ea8..97335fe48e3b5 100644 --- a/ppdiffusers/tests/models/test_lora_layers.py +++ b/ppdiffusers/tests/models/test_lora_layers.py @@ -20,8 +20,12 @@ import paddle.nn as nn from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin from ppdiffusers.models.attention_processor import LoRAAttnProcessor from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, floats_tensor @@ -30,19 +34,16 @@ def create_unet_lora_layers(unet: nn.Layer): lora_attn_procs = {} for name in unet.attn_processors.keys(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - unet.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = 
int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) unet_lora_layers = AttnProcsLayers(lora_attn_procs) return lora_attn_procs, unet_lora_layers @@ -52,8 +53,8 @@ def create_text_encoder_lora_layers(text_encoder: nn.Layer): for name, module in text_encoder.named_sublayers(include_self=True): if name.endswith(TEXT_ENCODER_ATTN_MODULE): text_lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=module.out_proj.weight.shape[1], - cross_attention_dim=None) + hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None + ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) return text_encoder_lora_layers @@ -70,14 +71,16 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -85,7 +88,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, @@ -95,11 +99,11 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config) text_encoder.eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet) text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder) @@ -128,11 +132,7 @@ def get_dummy_inputs(self): generator = paddle.Generator().manual_seed(0) noise = floats_tensor((batch_size, num_channels) + sizes) - input_ids = paddle.randint( - 1, - sequence_length, - size=(batch_size, sequence_length), - generator=generator) + input_ids = paddle.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator) pipeline_inputs = { "prompt": "A painting of a squirrel eating a burger", @@ -158,22 +158,17 @@ def test_lora_save_load(self): LoraLoaderMixin.save_lora_weights( save_directory=tmpdirname, unet_lora_layers=lora_components["unet_lora_layers"], - text_encoder_lora_layers=lora_components[ - "text_encoder_lora_layers"], - to_diffusers=False, ) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], + to_diffusers=False, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False) lora_images = sd_pipe(**pipeline_inputs).images 
lora_image_slice = lora_images[0, -3:, -3:, -1] # Outputs shouldn't match. - self.assertFalse( - paddle.allclose( - paddle.to_tensor(orig_image_slice), - paddle.to_tensor(lora_image_slice))) + self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice))) def test_lora_save_load_safetensors(self): pipeline_components, lora_components = self.get_dummy_components() @@ -189,24 +184,18 @@ def test_lora_save_load_safetensors(self): LoraLoaderMixin.save_lora_weights( save_directory=tmpdirname, unet_lora_layers=lora_components["unet_lora_layers"], - text_encoder_lora_layers=lora_components[ - "text_encoder_lora_layers"], + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], safe_serialization=True, - to_diffusers=True, ) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "pytorch_lora_weights.safetensors"))) + to_diffusers=True, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) sd_pipe.load_lora_weights(tmpdirname, from_diffusers=True) lora_images = sd_pipe(**pipeline_inputs).images lora_image_slice = lora_images[0, -3:, -3:, -1] # Outputs shouldn't match. - self.assertFalse( - paddle.allclose( - paddle.to_tensor(orig_image_slice), - paddle.to_tensor(lora_image_slice))) + self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice))) def test_lora_save_load_legacy(self): pipeline_components, lora_components = self.get_dummy_components() @@ -223,16 +212,11 @@ def test_lora_save_load_legacy(self): unet = sd_pipe.unet unet.set_attn_processor(unet_lora_attn_procs) unet.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False) lora_images = sd_pipe(**pipeline_inputs).images lora_image_slice = lora_images[0, -3:, -3:, -1] # Outputs shouldn't match. - self.assertFalse( - paddle.allclose( - paddle.to_tensor(orig_image_slice), - paddle.to_tensor(lora_image_slice))) + self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice))) diff --git a/ppdiffusers/tests/models/test_modeling_common.py b/ppdiffusers/tests/models/test_modeling_common.py index 2224b1d99e300..8780b3abc746b 100644 --- a/ppdiffusers/tests/models/test_modeling_common.py +++ b/ppdiffusers/tests/models/test_modeling_common.py @@ -45,12 +45,14 @@ def test_cached_files_are_used_when_no_internet(self): response_mock.raise_for_status.side_effect = HTTPError response_mock.json.return_value = {} orig_model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet" + ) with mock.patch("requests.request", return_value=response_mock): model = UNet2DConditionModel.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", - local_files_only=True, ) + local_files_only=True, + ) for p1, p2 in zip(orig_model.parameters(), model.parameters()): if (p1 != p2).cast("int64").sum() > 0: assert False, "Parameters not the same!" 
@@ -67,13 +69,12 @@ def test_one_request_upon_cached(self): subfolder="unet", cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) download_requests = [r.method for r in m.request_history] - assert (download_requests.count("HEAD") == 2 - ), "2 HEAD requests one for config, one for model" - assert (download_requests.count("GET") == 2 - ), "2 GET requests one for config, one for model" + assert download_requests.count("HEAD") == 2, "2 HEAD requests one for config, one for model" + assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model" with requests_mock.mock(real_http=True) as m: UNet2DConditionModel.from_pretrained( @@ -81,7 +82,8 @@ def test_one_request_upon_cached(self): subfolder="unet", cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) cache_requests = [r.method for r in m.request_history] # TODO check this @@ -92,15 +94,15 @@ def test_one_request_upon_cached(self): ppdiffusers.utils.import_utils._safetensors_available = True def test_weight_overwrite(self): - with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises( - RuntimeError) as error_context: + with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(RuntimeError) as error_context: UNet2DConditionModel.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", cache_dir=tmpdirname, in_channels=9, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) # make sure that error message states what keys are missing assert "size mismatch" in str(error_context.exception) @@ -114,7 +116,8 @@ def test_weight_overwrite(self): low_cpu_mem_usage=False, ignore_mismatched_sizes=True, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) assert model.config.in_channels == 9 @@ -139,8 +142,7 @@ def test_from_save_pretrained(self): if isinstance(new_image, dict): new_image = new_image.sample max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-05, - "Models give different forward passes") + self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes") def test_getattr_is_correct(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -183,10 +185,7 @@ def test_getattr_is_correct(self): with self.assertRaises(AttributeError) as error: model.does_not_exist - assert ( - str(error.exception) == - f"'{type(model).__name__}' object has no attribute 'does_not_exist'" - ) + assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" def test_from_save_pretrained_variant(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -196,8 +195,7 @@ def test_from_save_pretrained_variant(self): model.eval() with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, variant="fp16") - new_model = self.model_class.from_pretrained( - tmpdirname, variant="fp16") + new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16") if hasattr(new_model, "set_default_attn_processor"): new_model.set_default_attn_processor() # non-variant cannot be loaded @@ -208,8 +206,7 @@ def test_from_save_pretrained_variant(self): # support diffusion_pytorch_model.bin and model_state.pdparams assert "Error no file named model_state.pdparams found in directory" in str( error_context.exception - ) or "Error no file named diffusion_pytorch_model.bin found in directory" in str( - error_context.exception) + ) or "Error no file 
named diffusion_pytorch_model.bin found in directory" in str(error_context.exception) with paddle.no_grad(): image = model(**inputs_dict) @@ -219,8 +216,7 @@ def test_from_save_pretrained_variant(self): if isinstance(new_image, dict): new_image = new_image.sample max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-05, - "Models give different forward passes") + self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes") def test_from_save_pretrained_dtype(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -231,11 +227,9 @@ def test_from_save_pretrained_dtype(self): with tempfile.TemporaryDirectory() as tmpdirname: model.to(dtype=dtype) model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained( - tmpdirname, paddle_dtype=dtype) + new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype) assert new_model.dtype == dtype - new_model = self.model_class.from_pretrained( - tmpdirname, paddle_dtype=dtype) + new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype) assert new_model.dtype == dtype def test_determinism(self): @@ -266,8 +260,7 @@ def test_output(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_forward_with_norm_groups(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -281,8 +274,7 @@ def test_forward_with_norm_groups(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_forward_signature(self): init_dict, _ = self.prepare_init_args_and_inputs_for_common() @@ -320,8 +312,7 @@ def test_training(self): output = model(**inputs_dict) if isinstance(output, dict): output = output.sample - noise = paddle.randn( - shape=list((inputs_dict["sample"].shape[0], ) + self.output_shape)) + noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape)) loss = paddle.nn.functional.mse_loss(input=output, label=noise) loss.backward() @@ -333,8 +324,7 @@ def test_ema_training(self): output = model(**inputs_dict) if isinstance(output, dict): output = output.sample - noise = paddle.randn( - shape=list((inputs_dict["sample"].shape[0], ) + self.output_shape)) + noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape)) loss = paddle.nn.functional.mse_loss(input=output, label=noise) loss.backward() ema_model.step(model.parameters()) @@ -346,12 +336,10 @@ def set_nan_tensor_to_zero(t): def recursive_check(tuple_object, dict_object): if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object, dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) 
elif tuple_object is None: return @@ -360,7 +348,8 @@ def recursive_check(tuple_object, dict_object): paddle.allclose( set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), - atol=1e-05, ), + atol=1e-05, + ), msg=f"Tuple and dict output are not equal. Difference: {paddle.max(x=paddle.abs(x=tuple_object - dict_object))}. Tuple has `nan`: {paddle.isnan(x=tuple_object).any()} and `inf`: {paddle.isinf(x=tuple_object)}. Dict has `nan`: {paddle.isnan(x=dict_object).any()} and `inf`: {paddle.isinf(x=dict_object)}.", ) @@ -384,8 +373,7 @@ def test_enable_disable_gradient_checkpointing(self): self.assertFalse(model.is_gradient_checkpointing) def test_deprecated_kwargs(self): - has_kwarg_in_model_class = ( - "kwargs" in inspect.signature(self.model_class.__init__).parameters) + has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 if has_kwarg_in_model_class and not has_deprecated_kwarg: raise ValueError( diff --git a/ppdiffusers/tests/models/test_models_unet_1d.py b/ppdiffusers/tests/models/test_models_unet_1d.py index 8ff48ee303f86..8d1339ed5c4dc 100644 --- a/ppdiffusers/tests/models/test_models_unet_1d.py +++ b/ppdiffusers/tests/models/test_models_unet_1d.py @@ -79,9 +79,9 @@ def prepare_init_args_and_inputs_for_common(self): "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", - "DownResnetBlock1D", ), - "up_block_types": - ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), + "DownResnetBlock1D", + ), + "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), "act_fn": "mish", } inputs_dict = self.dummy_input @@ -91,38 +91,37 @@ def test_from_pretrained_hub(self): model, loading_info = UNet1DModel.from_pretrained( "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, - subfolder="unet", ) + subfolder="unet", + ) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input) assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") + model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") paddle.seed(0) num_features = model.config.in_channels seq_len = 16 - noise = paddle.randn(shape=(1, seq_len, num_features)).transpose( - perm=[0, 2, 1]) - time_step = paddle.full(shape=(num_features, ), fill_value=0) + noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1]) + time_step = paddle.full(shape=(num_features,), fill_value=0) with paddle.no_grad(): output = model(noise, time_step).sample.permute(0, 2, 1) output_slice = output[0, -3:, -3:].flatten() - expected_output_slice = paddle.to_tensor([ - -0.2857576608657837, - -0.9908187389373779, - 0.2976357340812683, - -0.8677187561988831, - -0.21778395771980286, - 0.08095654845237732, - -0.5871752500534058, - 0.3299727439880371, - -0.17421625554561615, - ]) - self.assertTrue( - paddle.allclose( - output_slice, expected_output_slice, rtol=0.001)) + expected_output_slice = paddle.to_tensor( + [ + -0.2857576608657837, + -0.9908187389373779, + 0.2976357340812683, + -0.8677187561988831, + -0.21778395771980286, + 0.08095654845237732, + -0.5871752500534058, + 0.3299727439880371, + -0.17421625554561615, + ] + ) + self.assertTrue(paddle.allclose(output_slice, expected_output_slice, rtol=0.001)) def 
test_forward_with_norm_groups(self): pass @@ -133,9 +132,9 @@ def test_unet_1d_maestro(self): model_id = "harmonai/maestro-150k" model = UNet1DModel.from_pretrained(model_id, subfolder="unet") sample_size = 65536 - noise = paddle.sin(x=paddle.arange( - start=sample_size, - dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1])) + noise = paddle.sin( + x=paddle.arange(start=sample_size, dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1]) + ) timestep = paddle.to_tensor([1.0]) # must cast float32 with paddle.no_grad(): output = model(noise, timestep).sample @@ -187,8 +186,7 @@ def test_output(self): output = output.sample self.assertIsNotNone(output) expected_shape = [inputs_dict["sample"].shape[0], 1] - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_ema_training(self): pass @@ -225,7 +223,8 @@ def test_from_pretrained_hub(self): value_function, vf_loading_info = UNet1DModel.from_pretrained( "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, - subfolder="value_function", ) + subfolder="value_function", + ) self.assertIsNotNone(value_function) self.assertEqual(len(vf_loading_info["missing_keys"]), 0) image = value_function(**self.dummy_input) @@ -235,19 +234,17 @@ def test_output_pretrained(self): value_function, vf_loading_info = UNet1DModel.from_pretrained( "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, - subfolder="value_function", ) + subfolder="value_function", + ) paddle.seed(0) num_features = value_function.config.in_channels seq_len = 14 - noise = paddle.randn(shape=(1, seq_len, num_features)).transpose( - perm=[0, 2, 1]) - time_step = paddle.full(shape=(num_features, ), fill_value=0) + noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1]) + time_step = paddle.full(shape=(num_features,), fill_value=0) with paddle.no_grad(): output = value_function(noise, time_step).sample expected_output_slice = paddle.to_tensor([291.51135254] * seq_len) - self.assertTrue( - paddle.allclose( - output.squeeze(-1), expected_output_slice, rtol=0.001)) + self.assertTrue(paddle.allclose(output.squeeze(-1), expected_output_slice, rtol=0.001)) def test_forward_with_norm_groups(self): pass diff --git a/ppdiffusers/tests/models/test_models_unet_2d.py b/ppdiffusers/tests/models/test_models_unet_2d.py index 6473ab0323f19..15147e00742e8 100644 --- a/ppdiffusers/tests/models/test_models_unet_2d.py +++ b/ppdiffusers/tests/models/test_models_unet_2d.py @@ -97,22 +97,19 @@ def prepare_init_args_and_inputs_for_common(self): return init_dict, inputs_dict def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True) + model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input).sample assert image is not None, "Make sure output is not None" def test_from_pretrained_accelerate(self): - model, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True) + model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) image = model(**self.dummy_input).sample assert image is not None, "Make sure output is not None" def 
test_from_pretrained_accelerate_wont_change_results(self): - model_accelerate, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True) + model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) model_accelerate model_accelerate.eval() noise = paddle.randn( @@ -122,7 +119,8 @@ def test_from_pretrained_accelerate_wont_change_results(self): model_accelerate.config.sample_size, model_accelerate.config.sample_size, ], - generator=paddle.Generator().manual_seed(0), ) + generator=paddle.Generator().manual_seed(0), + ) time_step = paddle.to_tensor([10] * noise.shape[0]) arr_accelerate = model_accelerate(noise, time_step)["sample"] del model_accelerate @@ -130,7 +128,8 @@ def test_from_pretrained_accelerate_wont_change_results(self): gc.collect() model_normal_load, _ = UNet2DModel.from_pretrained( "fusing/unet-ldm-dummy-update", - output_loading_info=True, ) + output_loading_info=True, + ) model_normal_load.eval() arr_normal_load = model_normal_load(noise, time_step)["sample"] assert paddle_all_close(arr_accelerate, arr_normal_load, rtol=0.001) @@ -145,25 +144,26 @@ def test_output_pretrained(self): model.config.sample_size, model.config.sample_size, ], - generator=paddle.Generator().manual_seed(0), ) + generator=paddle.Generator().manual_seed(0), + ) time_step = paddle.to_tensor([10] * noise.shape[0]) with paddle.no_grad(): output = model(noise, time_step).sample output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - 0.43855608, - -10.29346752, - -9.60953522, - -8.39902020, - -16.29206276, - -13.07511997, - -9.30383205, - -13.69859409, - -10.52999401, - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.001)) + expected_output_slice = paddle.to_tensor( + [ + 0.43855608, + -10.29346752, + -9.60953522, + -8.39902020, + -16.29206276, + -13.07511997, + -9.30383205, + -13.69859409, + -10.52999401, + ] + ) + self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.001)) class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): @@ -213,8 +213,7 @@ def prepare_init_args_and_inputs_for_common(self): @slow def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained( - "google/ncsnpp-celebahq-256", output_loading_info=True) + model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) inputs = self.dummy_input @@ -235,24 +234,23 @@ def test_output_pretrained_ve_mid(self): with paddle.no_grad(): output = model(noise, time_step).sample output_slice = output[0, -3:, -3:, -1].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -4836.2231, - -6487.1387, - -3816.7969, - -7964.9253, - -10966.2842, - -20043.6016, - 8137.0571, - 2340.3499, - 544.6114, - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.01)) + expected_output_slice = paddle.to_tensor( + [ + -4836.2231, + -6487.1387, + -3816.7969, + -7964.9253, + -10966.2842, + -20043.6016, + 8137.0571, + 2340.3499, + 544.6114, + ] + ) + self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained( - "fusing/ncsnpp-ffhq-ve-dummy-update") + model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") paddle.seed(0) batch_size = 4 num_channels = 3 @@ -262,13 
+260,10 @@ def test_output_pretrained_ve_large(self): with paddle.no_grad(): output = model(noise, time_step).sample output_slice = output[0, -3:, -3:, -1].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227, - 0.0256 - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.01)) + expected_output_slice = paddle.to_tensor( + [-0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227, 0.0256] + ) + self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) def test_forward_with_norm_groups(self): pass diff --git a/ppdiffusers/tests/models/test_models_unet_2d_condition.py b/ppdiffusers/tests/models/test_models_unet_2d_condition.py index 085837f1fb0dd..6b9930399b0c3 100644 --- a/ppdiffusers/tests/models/test_models_unet_2d_condition.py +++ b/ppdiffusers/tests/models/test_models_unet_2d_condition.py @@ -24,9 +24,17 @@ from ppdiffusers import UNet2DConditionModel from ppdiffusers.models.attention_processor import ( - CustomDiffusionAttnProcessor, LoRAAttnProcessor) -from ppdiffusers.utils import (floats_tensor, load_ppnlp_numpy, logging, - paddle_all_close, require_paddle_gpu, slow) + CustomDiffusionAttnProcessor, + LoRAAttnProcessor, +) +from ppdiffusers.utils import ( + floats_tensor, + load_ppnlp_numpy, + logging, + paddle_all_close, + require_paddle_gpu, + slow, +) from ppdiffusers.utils.import_utils import is_ppxformers_available from .test_modeling_common import ModelTesterMixin @@ -34,50 +42,41 @@ logger = logging.get_logger(__name__) -def create_lora_layers(model, mock_weights: bool=True): +def create_lora_layers(model, mock_weights: bool = True): lora_attn_procs = {} for name in model.attn_processors.keys(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - model.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = model.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(model.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = model.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) if mock_weights: with paddle.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight.set_value( - lora_attn_procs[name].to_q_lora.up.weight + 1) - lora_attn_procs[name].to_k_lora.up.weight.set_value( - lora_attn_procs[name].to_k_lora.up.weight + 1) - lora_attn_procs[name].to_v_lora.up.weight.set_value( - lora_attn_procs[name].to_v_lora.up.weight + 1) - lora_attn_procs[name].to_out_lora.up.weight.set_value( - lora_attn_procs[name].to_out_lora.up.weight + 1) + lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1) + lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1) + lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1) + lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1) return lora_attn_procs -def 
create_custom_ppdiffusion_layers(model, mock_weights: bool=True): +def create_custom_ppdiffusion_layers(model, mock_weights: bool = True): train_kv = True train_q_out = True custom_diffusion_attn_procs = {} st = model.state_dict() for name, _ in model.attn_processors.items(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - model.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = model.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(model.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = model.config.block_out_channels[block_id] @@ -87,36 +86,33 @@ def create_custom_ppdiffusion_layers(model, mock_weights: bool=True): "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], } if train_q_out: - weights["to_q_custom_diffusion.weight"] = st[layer_name + - ".to_q.weight"] - weights["to_out_custom_diffusion.0.weight"] = st[layer_name + - ".to_out.0.weight"] - weights["to_out_custom_diffusion.0.bias"] = st[layer_name + - ".to_out.0.bias"] + weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] + weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] + weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] if cross_attention_dim is not None: custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, ) + cross_attention_dim=cross_attention_dim, + ) custom_diffusion_attn_procs[name].load_dict(weights) if mock_weights: # add 1 to weights to mock trained weights with paddle.no_grad(): - custom_diffusion_attn_procs[ - name].to_k_custom_diffusion.weight.set_value( - custom_diffusion_attn_procs[ - name].to_k_custom_diffusion.weight + 1) - custom_diffusion_attn_procs[ - name].to_v_custom_diffusion.weight.set_value( - custom_diffusion_attn_procs[ - name].to_v_custom_diffusion.weight + 1) + custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight.set_value( + custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight + 1 + ) + custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight.set_value( + custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight + 1 + ) else: custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( train_kv=False, train_q_out=False, hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, ) + cross_attention_dim=cross_attention_dim, + ) del st return custom_diffusion_attn_procs @@ -165,9 +161,10 @@ def test_xformers_enable_works(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**init_dict) model.enable_xformers_memory_efficient_attention() - assert (model.mid_block.attentions[0].transformer_blocks[0] - .attn1.processor.__class__.__name__ == "XFormersAttnProcessor" - ), "xformers is not enabled" + assert ( + model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ + == "XFormersAttnProcessor" + ), "xformers is not enabled" def test_gradient_checkpointing(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -190,9 +187,7 @@ def 
test_gradient_checkpointing(self): named_params = dict(model.named_parameters()) named_params_2 = dict(model_2.named_parameters()) for name, param in named_params.items(): - self.assertTrue( - paddle_all_close( - param.grad, named_params_2[name].grad, atol=5e-05)) + self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-05)) def test_model_with_attention_head_dim_tuple(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -205,8 +200,7 @@ def test_model_with_attention_head_dim_tuple(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_use_linear_projection(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -219,8 +213,7 @@ def test_model_with_use_linear_projection(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_cross_attention_dim_tuple(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -238,8 +231,7 @@ def test_model_with_cross_attention_dim_tuple(self): self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_simple_projection(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -262,8 +254,7 @@ def test_model_with_simple_projection(self): self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_class_embeddings_concat(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -287,8 +278,7 @@ def test_model_with_class_embeddings_concat(self): self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_attention_slicing(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -327,34 +317,32 @@ class AttnEasyProc(nn.Layer): def __init__(self, num): super().__init__() self.weight = self.create_parameter( - (1, ), + (1,), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(num), ) + default_initializer=nn.initializer.Constant(num), + ) self.is_run = False self.number = 0 self.counter = 0 def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - number=None, ): + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + number=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 
query = attn.to_q(hidden_states) - encoder_hidden_states = (encoder_hidden_states - if encoder_hidden_states is not None - else hidden_states) + encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) query = attn.head_to_batch_dim(query) key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) - attention_probs = attn.get_attention_scores(query, key, - attention_mask) + attention_probs = attn.get_attention_scores(query, key, attention_mask) hidden_states = paddle.matmul(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) hidden_states = attn.to_out[0](hidden_states) @@ -385,12 +373,9 @@ def test_lora_processors(self): model.set_attn_processor(lora_attn_procs) model.set_attn_processor(model.attn_processors) with paddle.no_grad(): - sample2 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample1 - sample2).abs().max() < 0.0001 assert (sample3 - sample4).abs().max() < 0.0001 assert (sample2 - sample3).abs().max() > 0.0001 @@ -405,20 +390,16 @@ def test_lora_save_load(self): lora_attn_procs = create_lora_layers(model) model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs(tmpdirname, from_diffusers=False) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 1e-4 @@ -441,23 +422,16 @@ def test_lora_save_load_safetensors(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs( - tmpdirname, safe_serialization=True, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "pytorch_lora_weights.safetensors"))) + model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) paddle.seed(0) new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, from_diffusers=True, use_safetensors=True) + new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + 
new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 0.0001 assert (sample - old_sample).abs().max() > 0.0001 @@ -475,16 +449,15 @@ def test_lora_save_safetensors_load_torch(self): # Saving as torch, properly reloads with directly filename with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs( tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, - use_safetensors=False, ) + use_safetensors=False, + ) def test_lora_save_torch_force_load_safetensors_error(self): pass @@ -499,8 +472,7 @@ def test_lora_on_off(self): lora_attn_procs = create_lora_layers(model) model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample model.set_default_attn_processor() with paddle.no_grad(): new_sample = model(**inputs_dict).sample @@ -538,8 +510,7 @@ def test_custom_diffusion_processors(self): with paddle.no_grad(): sample1 = model(**inputs_dict).sample - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers( - model, mock_weights=False) + custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) # make sure we can set a list of attention processors model.set_attn_processor(custom_diffusion_attn_procs) @@ -564,8 +535,7 @@ def test_custom_diffusion_save_load(self): with paddle.no_grad(): old_sample = model(**inputs_dict).sample - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers( - model, mock_weights=False) + custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) model.set_attn_processor(custom_diffusion_attn_procs) with paddle.no_grad(): @@ -573,16 +543,14 @@ def test_custom_diffusion_save_load(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "paddle_custom_diffusion_weights.pdparams"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_custom_diffusion_weights.pdparams"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs( tmpdirname, weight_name="paddle_custom_diffusion_weights.pdparams", - from_diffusers=False, ) + from_diffusers=False, + ) with paddle.no_grad(): new_sample = new_model(**inputs_dict).sample @@ -604,8 +572,7 @@ def test_custom_diffusion_xformers_on_off(self): paddle.seed(0) model = self.model_class(**init_dict) - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers( - model, mock_weights=False) + custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) model.set_attn_processor(custom_diffusion_attn_procs) # default @@ -634,20 +601,15 @@ def tearDown(self): def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): dtype = paddle.float16 if fp16 else paddle.float32 - image = paddle.to_tensor(data=load_ppnlp_numpy( - self.get_file_format(seed, shape))).cast(dtype) + image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) return image - def get_unet_model(self, - fp16=False, - 
model_id="CompVis/stable-diffusion-v1-4"): + def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): revision = "fp16" if fp16 else None paddle_dtype = paddle.float16 if fp16 else paddle.float32 model = UNet2DConditionModel.from_pretrained( - model_id, - subfolder="unet", - paddle_dtype=paddle_dtype, - revision=revision) + model_id, subfolder="unet", paddle_dtype=paddle_dtype, revision=revision + ) model.eval() return model @@ -659,10 +621,7 @@ def test_set_attention_slice_auto(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 @@ -674,10 +633,7 @@ def test_set_attention_slice_max(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 @@ -689,10 +645,7 @@ def test_set_attention_slice_int(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 @@ -705,49 +658,35 @@ def test_set_attention_slice_list(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): dtype = "float16" if fp16 else "float32" - hidden_states = paddle.to_tensor(data=load_ppnlp_numpy( - self.get_file_format(seed, shape))).cast(dtype) + hidden_states = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) return hidden_states - @parameterized.expand([ - [ - 33, 4, - [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435] - ], + @parameterized.expand( [ - 47, - 0.55, + [33, 4, [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435]], [ - -0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719, - -0.0207 + 47, + 0.55, + [-0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719, -0.0207], ], - ], - [ - 21, - 0.89, [ - -0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, - 0.1778 + 21, + 0.89, + [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778], ], - ], - [ - 9, - 1000, [ - 0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, - -0.4424 + 9, + 1000, + [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4") @@ -755,93 +694,69 @@ def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): encoder_hidden_states = 
self.get_encoder_hidden_states(seed) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 83, - 4, [ - -0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, - -0.5806 + 83, + 4, + [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806], ], - ], - [ - 17, - 0.55, [ - -0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, - 0.0701 + 17, + 0.55, + [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701], ], - ], - [ - 8, - 0.89, [ - -0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, - 0.4639 + 8, + 0.89, + [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639], ], - ], - [ - 3, - 1000, [ - -0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, - -1.0078 + 3, + 1000, + [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="CompVis/stable-diffusion-v1-4", fp16=True) + model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) latents = self.get_latents(seed, fp16=True) encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([ - [ - 33, 4, - [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722] - ], + @parameterized.expand( [ - 47, - 0.55, + [33, 4, [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]], [ - -0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114, - -0.0436 + 47, + 0.55, + [-0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114, -0.0436], ], - ], - [ - 21, - 0.89, - [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175], - ], - [ - 9, - 1000, [ - 0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, - -0.4423 + 21, + 0.89, + [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175], ], - ], - ]) + [ + 9, + 1000, + [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423], + ], + ] + ) @require_paddle_gpu def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5") @@ -849,199 +764,151 @@ def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): encoder_hidden_states = self.get_encoder_hidden_states(seed) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, 
encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 83, - 4, [ - -0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, - -0.5972 + 83, + 4, + [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972], ], - ], - [ - 17, - 0.55, [ - -0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, - 0.0322 + 17, + 0.55, + [-0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322], + ], + [ + 8, + 0.89, + [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319], ], - ], - [ - 8, - 0.89, - [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319], - ], - [ - 3, - 1000, [ - -0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.028, - -1.002 + 3, + 1000, + [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.028, -1.002], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="runwayml/stable-diffusion-v1-5", fp16=True) + model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True) latents = self.get_latents(seed, fp16=True) encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([ + @parameterized.expand( [ - 33, - 4, [ - -0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, - -0.4858 + 33, + 4, + [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858], ], - ], - [ - 47, - 0.55, [ - -0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, - 0.9073 + 47, + 0.55, + [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073], + ], + [ + 21, + 0.89, + [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043], ], - ], - [ - 21, - 0.89, - [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043], - ], - [ - 9, - 1000, [ - 0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149, - -1.8931 + 9, + 1000, + [0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149, -1.8931], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_inpaint(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="runwayml/stable-diffusion-inpainting") + model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting") latents = self.get_latents(seed, shape=(4, 9, 64, 64)) encoder_hidden_states = self.get_encoder_hidden_states(seed) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == [4, 4, 64, 64] output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = 
paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 83, - 4, [ - -0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, - 1.1387 + 83, + 4, + [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387], ], - ], - [ - 17, - 0.55, - [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026], - ], - [ - 8, - 0.89, [ - -0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, - -0.3486 + 17, + 0.55, + [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026], ], - ], - [ - 3, - 1000, [ - 0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, - -0.9741 + 8, + 0.89, + [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486], ], - ], - ]) + [ + 3, + 1000, + [0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741], + ], + ] + ) @require_paddle_gpu def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="runwayml/stable-diffusion-inpainting", fp16=True) + model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True) latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True) encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == [4, 4, 64, 64] output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([ - [ - 83, 4, - [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231] - ], - [ - 17, - 0.55, - [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458], - ], + @parameterized.expand( [ - 8, - 0.89, + [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231]], [ - -0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, - 0.2139 + 17, + 0.55, + [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458], + ], + [ + 8, + 0.89, + [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139], ], - ], - [ - 3, - 1000, [ - 0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234, - -0.0539 + 3, + 1000, + [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234, -0.0539], ], - ], - ]) + ] + ) @require_paddle_gpu def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="stabilityai/stable-diffusion-2", fp16=True) + model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states( - seed, shape=(4, 77, 1024), fp16=True) + encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = 
paddle.to_tensor(expected_slice) diff --git a/ppdiffusers/tests/models/test_models_unet_3d_condition.py b/ppdiffusers/tests/models/test_models_unet_3d_condition.py index 12479b35ac6f0..ca2f44b1edd9f 100644 --- a/ppdiffusers/tests/models/test_models_unet_3d_condition.py +++ b/ppdiffusers/tests/models/test_models_unet_3d_condition.py @@ -20,8 +20,7 @@ import paddle from ppdiffusers.models import UNet3DConditionModel -from ppdiffusers.models.attention_processor import (AttnProcessor, - LoRAAttnProcessor) +from ppdiffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor from ppdiffusers.utils import floats_tensor, logging from ppdiffusers.utils.import_utils import is_ppxformers_available @@ -30,20 +29,18 @@ logger = logging.get_logger(__name__) -def create_lora_layers(model, mock_weights: bool=True): +def create_lora_layers(model, mock_weights: bool = True): lora_attn_procs = {} for name in model.attn_processors.keys(): has_cross_attention = name.endswith("attn2.processor") and not ( - name.startswith("transformer_in") or - "temp_attentions" in name.split(".")) - cross_attention_dim = (model.config.cross_attention_dim - if has_cross_attention else None) + name.startswith("transformer_in") or "temp_attentions" in name.split(".") + ) + cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None if name.startswith("mid_block"): hidden_size = model.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(model.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = model.config.block_out_channels[block_id] @@ -51,20 +48,15 @@ def create_lora_layers(model, mock_weights: bool=True): # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148 hidden_size = 8 * model.config.attention_head_dim - lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) if mock_weights: # add 1 to weights to mock trained weights with paddle.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight.set_value( - lora_attn_procs[name].to_q_lora.up.weight + 1) - lora_attn_procs[name].to_k_lora.up.weight.set_value( - lora_attn_procs[name].to_k_lora.up.weight + 1) - lora_attn_procs[name].to_v_lora.up.weight.set_value( - lora_attn_procs[name].to_v_lora.up.weight + 1) - lora_attn_procs[name].to_out_lora.up.weight.set_value( - lora_attn_procs[name].to_out_lora.up.weight + 1) + lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1) + lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1) + lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1) + lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1) return lora_attn_procs @@ -99,7 +91,8 @@ def prepare_init_args_and_inputs_for_common(self): "block_out_channels": (32, 64), "down_block_types": ( "CrossAttnDownBlock3D", - "DownBlock3D", ), + "DownBlock3D", + ), "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"), "cross_attention_dim": 32, "attention_head_dim": 8, @@ 
-121,9 +114,10 @@ def test_xformers_enable_works(self): model.enable_xformers_memory_efficient_attention() - assert (model.mid_block.attentions[0].transformer_blocks[0] - .attn1.processor.__class__.__name__ == "XFormersAttnProcessor" - ), "xformers is not enabled" + assert ( + model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ + == "XFormersAttnProcessor" + ), "xformers is not enabled" # Overriding to set `norm_num_groups` needs to be different for this model. def test_forward_with_norm_groups(self): @@ -140,8 +134,7 @@ def test_forward_with_norm_groups(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") # Overriding since the UNet3D outputs a different structure. def test_determinism(self): @@ -199,12 +192,9 @@ def test_lora_processors(self): model.set_attn_processor(model.attn_processors) with paddle.no_grad(): - sample2 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample1 - sample2).abs().max() < 1e-4 assert (sample3 - sample4).abs().max() < 1e-4 @@ -227,23 +217,20 @@ def test_lora_save_load(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs( tmpdirname, - to_diffusers=False, ) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + to_diffusers=False, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs(tmpdirname, from_diffusers=False) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 1e-4 @@ -265,24 +252,17 @@ def test_lora_save_load_safetensors(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs( - tmpdirname, safe_serialization=True, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "pytorch_lora_weights.safetensors"))) + model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) paddle.seed(0) new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, use_safetensors=True, from_diffusers=True) + new_model.load_attn_procs(tmpdirname, use_safetensors=True, 
from_diffusers=True) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 1e-4 @@ -303,16 +283,15 @@ def test_lora_save_safetensors_load_torch(self): # Saving as paddle, properly reloads with directly filename with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs( tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, - from_diffusers=True, ) + from_diffusers=True, + ) def test_lora_save_paddle_force_load_safetensors_error(self): pass @@ -332,8 +311,7 @@ def test_lora_on_off(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample model.set_attn_processor(AttnProcessor()) diff --git a/ppdiffusers/tests/models/test_models_vae.py b/ppdiffusers/tests/models/test_models_vae.py index 8cc3c0794fbd8..c385339e1b134 100644 --- a/ppdiffusers/tests/models/test_models_vae.py +++ b/ppdiffusers/tests/models/test_models_vae.py @@ -20,8 +20,13 @@ from parameterized import parameterized from ppdiffusers import AutoencoderKL -from ppdiffusers.utils import (floats_tensor, load_ppnlp_numpy, - paddle_all_close, require_paddle_gpu, slow) +from ppdiffusers.utils import ( + floats_tensor, + load_ppnlp_numpy, + paddle_all_close, + require_paddle_gpu, + slow, +) from .test_modeling_common import ModelTesterMixin @@ -100,13 +105,10 @@ def test_gradient_checkpointing(self): named_params_2 = dict(model_2.named_parameters()) with paddle.no_grad(): for name, param in named_params.items(): - self.assertTrue( - paddle_all_close( - param.grad, named_params_2[name].grad, atol=5e-5)) + self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-5)) def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained( - "fusing/autoencoder-kl-dummy", output_loading_info=True) + model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input) @@ -124,25 +126,25 @@ def test_output_pretrained(self): model.config.sample_size, model.config.sample_size, ], - generator=paddle.Generator().manual_seed(0), ) + generator=paddle.Generator().manual_seed(0), + ) with paddle.no_grad(): - output = model( - image, sample_posterior=True, generator=generator).sample + output = model(image, sample_posterior=True, generator=generator).sample output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -0.39049336, - 0.34836933, - 0.27105471, - -0.02148458, - 0.00975929, - 0.27822807, - -0.12224892, - -0.02011922, - 0.19761699, - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.01)) + expected_output_slice = paddle.to_tensor( + [ + -0.39049336, + 0.34836933, + 0.27105471, + -0.02148458, + 0.00975929, + 0.27822807, + -0.12224892, + -0.02011922, + 0.19761699, + ] + ) + 
self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) @slow @@ -157,115 +159,77 @@ def tearDown(self): def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): dtype = paddle.float16 if fp16 else paddle.float32 - image = paddle.to_tensor(data=load_ppnlp_numpy( - self.get_file_format(seed, shape))).cast(dtype) + image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) return image - def get_sd_vae_model(self, - model_id="CompVis/stable-diffusion-v1-4", - fp16=False): + def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): revision = "fp16" if fp16 else None paddle_dtype = paddle.float16 if fp16 else paddle.float32 - model = AutoencoderKL.from_pretrained( - model_id, - subfolder="vae", - paddle_dtype=paddle_dtype, - revision=revision) + model = AutoencoderKL.from_pretrained(model_id, subfolder="vae", paddle_dtype=paddle_dtype, revision=revision) model.eval() return model def get_generator(self, seed=0): return paddle.Generator().manual_seed(seed) - @parameterized.expand([ - [ - 33, - [ - -0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206, - -0.0824 - ], - [ - -0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, - -0.1824 - ], - ], + @parameterized.expand( [ - 47, [ - -0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493, - -0.4089 + 33, + [-0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206, -0.0824], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824], ], [ - 0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, - -0.1131 + 47, + [-0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493, -0.4089], + [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], ], - ], - ]) + ] + ) def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): model = self.get_sd_vae_model() image = self.get_sd_image(seed) generator = self.get_generator(seed) with paddle.no_grad(): - sample = model( - image, generator=generator, sample_posterior=True).sample + sample = model(image, generator=generator, sample_posterior=True).sample assert sample.shape == image.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ - [ - 33, [ - -0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, - -0.0999 - ] - ], + @parameterized.expand( [ - 47, [ - -0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334, - 0.2247 - ] - ], - ]) + [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], + [47, [-0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334, 0.2247]], + ] + ) @require_paddle_gpu def test_stable_diffusion_fp16(self, seed, expected_slice): model = self.get_sd_vae_model(fp16=True) image = self.get_sd_image(seed, fp16=True) generator = self.get_generator(seed) with paddle.no_grad(): - sample = model( - image, generator=generator, sample_posterior=True).sample + sample = model(image, generator=generator, sample_posterior=True).sample assert sample.shape == image.shape output_slice = sample[-1, -2:, :2, -2:].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 33, [ - -0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, - -0.0814 + 33, + [-0.1609, 0.9866, -0.0487, -0.0777, 
-0.2716, 0.8368, -0.2055, -0.0814], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824], ], [ - -0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, - -0.1824 + 47, + [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], + [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], ], - ], - [ - 47, - [ - -0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, - -0.4085 - ], - [ - 0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, - -0.1131 - ], - ], - ]) - def test_stable_diffusion_mode(self, seed, expected_slice, - expected_slice_mps): + ] + ) + def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): model = self.get_sd_vae_model() image = self.get_sd_image(seed) with paddle.no_grad(): @@ -275,28 +239,27 @@ def test_stable_diffusion_mode(self, seed, expected_slice, expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 13, [ - -0.2051, - -0.1803, - -0.2311, - -0.2114, - -0.3292, - -0.3574, - -0.2953, - -0.3323, + 13, + [ + -0.2051, + -0.1803, + -0.2311, + -0.2114, + -0.3292, + -0.3574, + -0.2953, + -0.3323, + ], ], - ], - [ - 37, [ - -0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372, - -0.4925 + 37, + [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372, -0.4925], ], - ], - ]) + ] + ) @require_paddle_gpu def test_stable_diffusion_decode(self, seed, expected_slice): model = self.get_sd_vae_model() @@ -308,28 +271,27 @@ def test_stable_diffusion_decode(self, seed, expected_slice): expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 27, [ - -0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465, - -0.2039 + 27, + [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465, -0.2039], ], - ], - [ - 16, [ - -0.1628, - -0.2134, - -0.2747, - -0.2642, - -0.3774, - -0.4404, - -0.3687, - -0.4277, + 16, + [ + -0.1628, + -0.2134, + -0.2747, + -0.2642, + -0.3774, + -0.4404, + -0.3687, + -0.4277, + ], ], - ], - ]) + ] + ) @require_paddle_gpu def test_stable_diffusion_decode_fp16(self, seed, expected_slice): model = self.get_sd_vae_model(fp16=True) @@ -341,7 +303,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice): expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([(13, ), (16, ), (27, )]) + @parameterized.expand([(13,), (16,), (27,)]) @require_paddle_gpu def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed): model = self.get_sd_vae_model(fp16=True) @@ -358,7 +320,7 @@ def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed): assert paddle_all_close(sample, sample_2, atol=1e-1) - @parameterized.expand([(13, ), (16, ), (37, )]) + @parameterized.expand([(13,), (16,), (37,)]) @require_paddle_gpu def test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed): model = self.get_sd_vae_model() @@ -375,36 +337,38 @@ def test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed): assert paddle_all_close(sample, sample_2, atol=1e-2) - @parameterized.expand([ + @parameterized.expand( [ - 33, [ - -0.3001, - 0.0918, - -2.6984, - -3.972, - -3.2099, - -5.0353, - 1.7338, - -0.2065, - 3.4267, + 33, + [ + -0.3001, + 0.0918, + -2.6984, + -3.972, + -3.2099, + -5.0353, + 1.7338, + -0.2065, + 3.4267, + ], ], - 
], - [ - 47, [ - -1.503, - -4.3871, - -6.0355, - -9.1157, - -1.6661, - -2.7853, - 2.1607, - -5.0823, - 2.5633, + 47, + [ + -1.503, + -4.3871, + -6.0355, + -9.1157, + -1.6661, + -2.7853, + 2.1607, + -5.0823, + 2.5633, + ], ], - ], - ]) + ] + ) def test_stable_diffusion_encode_sample(self, seed, expected_slice): model = self.get_sd_vae_model() image = self.get_sd_image(seed) @@ -412,11 +376,8 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice): with paddle.no_grad(): dist = model.encode(image).latent_dist sample = dist.sample(generator=generator) - assert list(sample.shape) == [image.shape[0], 4] + [ - (i // 8) for i in image.shape[2:] - ] + assert list(sample.shape) == [image.shape[0], 4] + [(i // 8) for i in image.shape[2:]] output_slice = sample[0, -1, -3:, -3:].flatten().cpu() expected_output_slice = paddle.to_tensor(expected_slice) tolerance = 0.01 - assert paddle_all_close( - output_slice, expected_output_slice, atol=tolerance) + assert paddle_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/ppdiffusers/tests/models/test_models_vq.py b/ppdiffusers/tests/models/test_models_vq.py index 9b19455a496b6..af2a6292d9353 100644 --- a/ppdiffusers/tests/models/test_models_vq.py +++ b/ppdiffusers/tests/models/test_models_vq.py @@ -60,8 +60,7 @@ def test_training(self): pass def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained( - "fusing/vqgan-dummy", output_loading_info=True) + model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input) @@ -71,26 +70,28 @@ def test_output_pretrained(self): model = VQModel.from_pretrained("fusing/vqgan-dummy") model.eval() paddle.seed(0) - image = paddle.randn(shape=[ - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - ]) + image = paddle.randn( + shape=[ + 1, + model.config.in_channels, + model.config.sample_size, + model.config.sample_size, + ] + ) with paddle.no_grad(): output = model(image).sample output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -0.027147896587848663, - -0.41129639744758606, - -0.17730756103992462, - -0.5245445370674133, - -0.2423611730337143, - -0.3957087993621826, - -0.16461530327796936, - -0.06902074813842773, - -0.01736617460846901, - ]) - self.assertTrue( - paddle.allclose( - output_slice, expected_output_slice, atol=0.01)) + expected_output_slice = paddle.to_tensor( + [ + -0.027147896587848663, + -0.41129639744758606, + -0.17730756103992462, + -0.5245445370674133, + -0.2423611730337143, + -0.3957087993621826, + -0.16461530327796936, + -0.06902074813842773, + -0.01736617460846901, + ] + ) + self.assertTrue(paddle.allclose(output_slice, expected_output_slice, atol=0.01)) diff --git a/ppdiffusers/tests/models/test_unet_2d_blocks.py b/ppdiffusers/tests/models/test_unet_2d_blocks.py index df1fdae9f4acf..cfb2100ee38ba 100644 --- a/ppdiffusers/tests/models/test_unet_2d_blocks.py +++ b/ppdiffusers/tests/models/test_unet_2d_blocks.py @@ -16,13 +16,28 @@ import unittest from ppdiffusers.models.unet_2d_blocks import ( - AttnDownBlock2D, AttnDownEncoderBlock2D, AttnSkipDownBlock2D, - AttnSkipUpBlock2D, AttnUpBlock2D, AttnUpDecoderBlock2D, - CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, DownEncoderBlock2D, - ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, SimpleCrossAttnDownBlock2D, - SimpleCrossAttnUpBlock2D, SkipDownBlock2D, 
SkipUpBlock2D, UNetMidBlock2D, - UNetMidBlock2DCrossAttn, UNetMidBlock2DSimpleCrossAttn, UpBlock2D, - UpDecoderBlock2D) + AttnDownBlock2D, + AttnDownEncoderBlock2D, + AttnSkipDownBlock2D, + AttnSkipUpBlock2D, + AttnUpBlock2D, + AttnUpDecoderBlock2D, + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + DownEncoderBlock2D, + ResnetDownsampleBlock2D, + ResnetUpsampleBlock2D, + SimpleCrossAttnDownBlock2D, + SimpleCrossAttnUpBlock2D, + SkipDownBlock2D, + SkipUpBlock2D, + UNetMidBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + UpDecoderBlock2D, +) from .test_unet_blocks_common import UNetBlockTesterMixin @@ -89,8 +104,7 @@ class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): block_type = "down" def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -118,8 +132,7 @@ def dummy_input(self): return super().get_dummy_input(include_encoder_hidden_states=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -269,8 +282,7 @@ class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): block_type = "mid" def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -289,8 +301,7 @@ def test_output(self): super().test_output(expected_slice) -class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, - unittest.TestCase): +class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): block_class = UNetMidBlock2DSimpleCrossAttn block_type = "mid" @@ -299,8 +310,7 @@ def dummy_input(self): return super().get_dummy_input(include_encoder_hidden_states=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -374,8 +384,7 @@ def dummy_input(self): return super().get_dummy_input(include_res_hidden_states_tuple=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -400,13 +409,10 @@ class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): @property def dummy_input(self): - return super().get_dummy_input( - include_res_hidden_states_tuple=True, - include_encoder_hidden_states=True) + return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict diff --git 
a/ppdiffusers/tests/models/test_unet_blocks_common.py b/ppdiffusers/tests/models/test_unet_blocks_common.py index 9f0920c87ef10..4595f43aec64d 100644 --- a/ppdiffusers/tests/models/test_unet_blocks_common.py +++ b/ppdiffusers/tests/models/test_unet_blocks_common.py @@ -35,16 +35,15 @@ def output_shape(self): return 4, 32, 32, 32 elif self.block_type == "up": return 4, 32, 64, 64 - raise ValueError( - f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'." - ) + raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.") def get_dummy_input( - self, - include_temb=True, - include_res_hidden_states_tuple=False, - include_encoder_hidden_states=False, - include_skip_sample=False, ): + self, + include_temb=True, + include_res_hidden_states_tuple=False, + include_encoder_hidden_states=False, + include_skip_sample=False, + ): batch_size = 4 num_channels = 32 sizes = 32, 32 @@ -54,28 +53,20 @@ def get_dummy_input( dummy_input = {"hidden_states": hidden_states} if include_temb: temb_channels = 128 - dummy_input["temb"] = randn_tensor( - (batch_size, temb_channels), generator=generator) + dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator) if include_res_hidden_states_tuple: generator_1 = paddle.Generator().manual_seed(1) - dummy_input["res_hidden_states_tuple"] = (randn_tensor( - shape, generator=generator_1), ) + dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1),) if include_encoder_hidden_states: - dummy_input["encoder_hidden_states"] = floats_tensor( - (batch_size, 32, 32)) + dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32)) if include_skip_sample: - dummy_input["skip_sample"] = randn_tensor( - (batch_size, 3) + sizes, generator=generator) + dummy_input["skip_sample"] = randn_tensor((batch_size, 3) + sizes, generator=generator) paddle.seed(0) return dummy_input def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 32, - "out_channels": 32, - "temb_channels": 128 - } + init_dict = {"in_channels": 32, "out_channels": 32, "temb_channels": 128} if self.block_type == "up": init_dict["prev_output_channel"] = 32 if self.block_type == "mid": @@ -94,8 +85,7 @@ def test_output(self, expected_slice): self.assertEqual(list(output.shape), list(self.output_shape)) output_slice = output[0, -1, -3:, -3:] expected_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close( - output_slice.flatten(), expected_slice, atol=0.005) + assert paddle_all_close(output_slice.flatten(), expected_slice, atol=0.005) def test_training(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff --git a/ppdiffusers/tests/others/test_config.py b/ppdiffusers/tests/others/test_config.py index 171d2ea28e771..e4637ce2c35a3 100644 --- a/ppdiffusers/tests/others/test_config.py +++ b/ppdiffusers/tests/others/test_config.py @@ -16,10 +16,15 @@ import tempfile import unittest -from ppdiffusers import (DDIMScheduler, DDPMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, PNDMScheduler, logging) +from ppdiffusers import ( + DDIMScheduler, + DDPMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + PNDMScheduler, + logging, +) from ppdiffusers.configuration_utils import ConfigMixin, register_to_config from ppdiffusers.utils.testing_utils import CaptureLogger @@ -44,13 +49,7 @@ class SampleObject3(ConfigMixin): 
config_name = "config.json" @register_to_config - def __init__(self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3]): + def __init__(self, a=2, b=5, c=(2, 5), d="for diffusion", e=[1, 3], f=[1, 3]): pass @@ -99,8 +98,7 @@ def test_save_load(self): assert config["e"] == [1, 3] with tempfile.TemporaryDirectory() as tmpdirname: obj.save_config(tmpdirname) - new_obj = SampleObject.from_config( - SampleObject.load_config(tmpdirname)) + new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname)) new_config = new_obj.config config = dict(config) new_config = dict(new_config) @@ -114,8 +112,8 @@ def test_load_ddim_from_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: ddim = DDIMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert ddim.__class__ == DDIMScheduler assert cap_logger.out == "" @@ -125,8 +123,8 @@ def test_load_euler_from_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert euler.__class__ == EulerDiscreteScheduler assert cap_logger.out == "" @@ -136,8 +134,8 @@ def test_load_euler_ancestral_from_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerAncestralDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert euler.__class__ == EulerAncestralDiscreteScheduler assert cap_logger.out == "" @@ -147,8 +145,8 @@ def test_load_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: pndm = PNDMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert pndm.__class__ == PNDMScheduler assert cap_logger.out == "" @@ -161,10 +159,10 @@ def test_overwrite_config_on_load(self): "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler", prediction_type="sample", - beta_end=8, ) + beta_end=8, + ) with CaptureLogger(logger) as cap_logger_2: - ddpm_2 = DDPMScheduler.from_pretrained( - "google/ddpm-celebahq-256", beta_start=88) + ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88) assert ddpm.__class__ == DDPMScheduler assert ddpm.config.prediction_type == "sample" assert ddpm.config.beta_end == 8 @@ -178,7 +176,7 @@ def test_load_dpmsolver(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: dpm = DPMSolverMultistepScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert dpm.__class__ == DPMSolverMultistepScheduler assert cap_logger.out == "" diff --git a/ppdiffusers/tests/others/test_ema.py b/ppdiffusers/tests/others/test_ema.py index 1ed2044e555e2..e8bd66abcfbee 100644 --- a/ppdiffusers/tests/others/test_ema.py +++ b/ppdiffusers/tests/others/test_ema.py @@ -33,13 +33,13 @@ class EMAModelTests(unittest.TestCase): generator = paddle.Generator().manual_seed(0) def get_models(self, decay=0.9999): - unet = UNet2DConditionModel.from_pretrained( - self.model_id, 
subfolder="unet") + unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet") ema_unet = EMAModel( unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, - model_config=unet.config, ) + model_config=unet.config, + ) return unet, ema_unet def get_dummy_inputs(self): @@ -48,21 +48,23 @@ def get_dummy_inputs(self): self.batch_size, self.num_in_channels, self.latent_height, - self.latent_width, ), - generator=self.generator, ) - timesteps = paddle.randint( - 0, 1000, shape=(self.batch_size, ), generator=self.generator) + self.latent_width, + ), + generator=self.generator, + ) + timesteps = paddle.randint(0, 1000, shape=(self.batch_size,), generator=self.generator) encoder_hidden_states = paddle.randn( (self.batch_size, self.prompt_length, self.text_encoder_hidden_dim), - generator=self.generator, ) + generator=self.generator, + ) return noisy_latents, timesteps, encoder_hidden_states def simulate_backprop(self, unet): updated_state_dict = {} for k, param in unet.state_dict().items(): - updated_param = paddle.randn( - param.shape, dtype=param.dtype) + (param * paddle.randn( - param.shape, dtype=param.dtype)) + updated_param = paddle.randn(param.shape, dtype=param.dtype) + ( + param * paddle.randn(param.shape, dtype=param.dtype) + ) updated_state_dict.update({k: updated_param}) unet.load_dict(updated_state_dict) return unet @@ -131,8 +133,7 @@ def test_consecutive_shadow_params_updated(self): ema_unet.step(unet_step_two.parameters()) step_two_shadow_params = ema_unet.shadow_params - for step_one, step_two in zip(step_one_shadow_params, - step_two_shadow_params): + for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params): assert not paddle.allclose(step_one, step_two) def test_zero_decay(self): @@ -148,23 +149,19 @@ def test_zero_decay(self): ema_unet.step(unet_step_two.parameters()) step_two_shadow_params = ema_unet.shadow_params - for step_one, step_two in zip(step_one_shadow_params, - step_two_shadow_params): + for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params): assert paddle.allclose(step_one, step_two) def test_serialization(self): unet, ema_unet = self.get_models() - noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs( - ) + noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs() with tempfile.TemporaryDirectory() as tmpdir: ema_unet.save_pretrained(tmpdir) - loaded_unet = UNet2DConditionModel.from_pretrained( - tmpdir, model_cls=UNet2DConditionModel) + loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel) # Since no EMA step has been performed the outputs should match. 
output = unet(noisy_latents, timesteps, encoder_hidden_states).sample - output_loaded = loaded_unet(noisy_latents, timesteps, - encoder_hidden_states).sample + output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample assert paddle.allclose(output, output_loaded, atol=1e-4) diff --git a/ppdiffusers/tests/others/test_image_processor.py b/ppdiffusers/tests/others/test_image_processor.py index 054fe2b955ca9..e0c88c40e56b4 100644 --- a/ppdiffusers/tests/others/test_image_processor.py +++ b/ppdiffusers/tests/others/test_image_processor.py @@ -50,10 +50,10 @@ def test_vae_image_processor_pd(self): for output_type in ["pd", "np", "pil"]: out = image_processor.postprocess( image_processor.preprocess(input_pd), - output_type=output_type, ) + output_type=output_type, + ) out_np = self.to_np(out) - in_np = (input_np * - 255).round() if output_type == "pil" else input_np + in_np = (input_np * 255).round() if output_type == "pil" else input_np assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" @@ -63,12 +63,10 @@ def test_vae_image_processor_np(self): input_np = self.dummy_sample.transpose([0, 2, 3, 1]).cpu().numpy() for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_np), output_type=output_type) + out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) out_np = self.to_np(out) - in_np = (input_np * - 255).round() if output_type == "pil" else input_np + in_np = (input_np * 255).round() if output_type == "pil" else input_np assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" @@ -80,12 +78,10 @@ def test_vae_image_processor_pil(self): input_pil = image_processor.numpy_to_pil(input_np) for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_pil), output_type=output_type) + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) for i, o in zip(input_pil, out): in_np = np.array(i) - out_np = (self.to_np(out) if output_type == "pil" else - (self.to_np(out) * 255).round()) + out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" @@ -98,20 +94,24 @@ def test_preprocess_input_3d(self): out_pt_4d = image_processor.postprocess( image_processor.preprocess(input_pd_4d), - output_type="np", ) + output_type="np", + ) out_pt_3d = image_processor.postprocess( image_processor.preprocess(input_pd_3d), - output_type="np", ) + output_type="np", + ) input_np_4d = self.to_np(self.dummy_sample) input_np_3d = input_np_4d.squeeze(0) out_np_4d = image_processor.postprocess( image_processor.preprocess(input_np_4d), - output_type="np", ) + output_type="np", + ) out_np_3d = image_processor.postprocess( image_processor.preprocess(input_np_3d), - output_type="np", ) + output_type="np", + ) assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 @@ -124,22 +124,26 @@ def test_preprocess_input_list(self): out_pt_4d = image_processor.postprocess( image_processor.preprocess(input_pd_4d), - output_type="np", ) + output_type="np", + ) out_pt_list = image_processor.postprocess( image_processor.preprocess(input_pd_list), - output_type="np", ) + output_type="np", + ) input_np_4d = 
self.to_np(self.dummy_sample) list(input_np_4d) out_np_4d = image_processor.postprocess( image_processor.preprocess(input_pd_4d), - output_type="np", ) + output_type="np", + ) out_np_list = image_processor.postprocess( image_processor.preprocess(input_pd_list), - output_type="np", ) + output_type="np", + ) assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 assert np.abs(out_np_4d - out_np_list).max() < 1e-6 diff --git a/ppdiffusers/tests/others/test_training.py b/ppdiffusers/tests/others/test_training.py index c52c0988951f2..12b72686eaed6 100644 --- a/ppdiffusers/tests/others/test_training.py +++ b/ppdiffusers/tests/others/test_training.py @@ -17,8 +17,7 @@ import paddle -from ppdiffusers import (DDIMScheduler, DDPMScheduler, UNet2DConditionModel, - UNet2DModel) +from ppdiffusers import DDIMScheduler, DDPMScheduler, UNet2DConditionModel, UNet2DModel from ppdiffusers.training_utils import set_seed from ppdiffusers.utils.import_utils import is_ppxformers_available from ppdiffusers.utils.testing_utils import slow @@ -27,10 +26,8 @@ class UNet2DModelTrainingTests(unittest.TestCase): def get_model_optimizer(self, resolution=32): set_seed(0) - model = UNet2DModel( - sample_size=resolution, in_channels=3, out_channels=3) - optimizer = paddle.optimizer.SGD(parameters=model.parameters(), - learning_rate=0.0001) + model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3) + optimizer = paddle.optimizer.SGD(parameters=model.parameters(), learning_rate=0.0001) return model, optimizer @slow @@ -40,34 +37,27 @@ def test_training_step_equality(self): beta_start=0.0001, beta_end=0.02, beta_schedule="linear", - clip_sample=True, ) + clip_sample=True, + ) ddim_scheduler = DDIMScheduler( num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02, beta_schedule="linear", - clip_sample=True, ) - assert (ddpm_scheduler.config.num_train_timesteps == - ddim_scheduler.config.num_train_timesteps) + clip_sample=True, + ) + assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps set_seed(0) - clean_images = [ - paddle.randn(shape=(4, 3, 32, 32)).clip( - min=-1, max=1) for _ in range(4) - ] + clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - timesteps = [ - paddle.randint(0, 1000, (4, )).astype(dtype="int64") - for _ in range(4) - ] + timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] model, optimizer = self.get_model_optimizer(resolution=32) model.train() for i in range(4): optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample - loss = paddle.nn.functional.mse_loss( - input=ddpm_noise_pred, label=noise[i]) + loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer @@ -75,30 +65,22 @@ def test_training_step_equality(self): model.train() for i in range(4): optimizer.clear_grad() - ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample - loss = paddle.nn.functional.mse_loss( - input=ddim_noise_pred, label=noise[i]) + loss = 
paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer - self.assertTrue( - paddle.allclose( - ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) - self.assertTrue( - paddle.allclose( - ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) + self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) + self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) # new added class UNet2DConditionModelTrainingTests(unittest.TestCase): def get_model_optimizer(self, resolution=32): set_seed(0) - model = UNet2DConditionModel( - sample_size=resolution, in_channels=3, out_channels=3) - optimizer = paddle.optimizer.AdamW( - parameters=model.parameters(), learning_rate=0.0001) + model = UNet2DConditionModel(sample_size=resolution, in_channels=3, out_channels=3) + optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=0.0001) return model, optimizer @slow @@ -107,37 +89,31 @@ def test_training_step_equality(self): num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) ddim_scheduler = DDIMScheduler( num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, - beta_schedule="scaled_linear", ) - assert (ddpm_scheduler.config.num_train_timesteps == - ddim_scheduler.config.num_train_timesteps) + beta_schedule="scaled_linear", + ) + assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps set_seed(0) - clean_images = [ - paddle.randn(shape=(4, 3, 32, 32)).clip( - min=-1, max=1) for _ in range(4) - ] + clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)] - timesteps = [ - paddle.randint(0, 1000, (4, )).astype(dtype="int64") - for _ in range(4) - ] + timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] model, optimizer = self.get_model_optimizer(resolution=32) model.train() for i in range(4): optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddpm_noise_pred = model( ddpm_noisy_images, timesteps[i], - encoder_hidden_states=text_embeddings[i], ).sample - loss = paddle.nn.functional.mse_loss( - input=ddpm_noise_pred, label=noise[i]) + encoder_hidden_states=text_embeddings[i], + ).sample + loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer @@ -145,23 +121,18 @@ def test_training_step_equality(self): model.train() for i in range(4): optimizer.clear_grad() - ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddim_noise_pred = model( ddim_noisy_images, timesteps[i], - encoder_hidden_states=text_embeddings[i], ).sample - loss = paddle.nn.functional.mse_loss( - input=ddim_noise_pred, label=noise[i]) + encoder_hidden_states=text_embeddings[i], + ).sample + loss = paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer - self.assertTrue( - paddle.allclose( - ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) - self.assertTrue( - paddle.allclose( - 
ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) + self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) + self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) @unittest.skipIf( not is_ppxformers_available(), @@ -173,17 +144,12 @@ def test_recompute_xformers_training(self): num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) set_seed(0) - clean_images = [ - paddle.randn(shape=(4, 3, 32, 32)).clip( - min=-1, max=1) for _ in range(4) - ] + clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - timesteps = [ - paddle.randint(0, 1000, (4, )).astype(dtype="int64") - for _ in range(4) - ] + timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)] model, optimizer = self.get_model_optimizer(resolution=32) model.enable_gradient_checkpointing() @@ -191,13 +157,12 @@ def test_recompute_xformers_training(self): model.train() for i in range(4): optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddpm_noise_pred = model( ddpm_noisy_images, timesteps[i], - encoder_hidden_states=text_embeddings[i], ).sample - loss = paddle.nn.functional.mse_loss( - input=ddpm_noise_pred, label=noise[i]) + encoder_hidden_states=text_embeddings[i], + ).sample + loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) loss.backward() optimizer.step() diff --git a/ppdiffusers/tests/others/test_utils.py b/ppdiffusers/tests/others/test_utils.py index 870e791a6f54b..ae27388bf5f60 100644 --- a/ppdiffusers/tests/others/test_utils.py +++ b/ppdiffusers/tests/others/test_utils.py @@ -20,34 +20,27 @@ class DeprecateTester(unittest.TestCase): - higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + - __version__.split(".")[1:]) + higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:]) lower_version = "0.0.1" def test_deprecate_function_arg(self): kwargs = {"deprecated_arg": 4} with self.assertWarns(FutureWarning) as warning: - output = deprecate( - "deprecated_arg", - self.higher_version, - "message", - take_from=kwargs) + output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs) assert output == 4 assert ( - str(warning.warning) == - f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" ) def test_deprecate_function_arg_tuple(self): kwargs = {"deprecated_arg": 4} with self.assertWarns(FutureWarning) as warning: - output = deprecate( - ("deprecated_arg", self.higher_version, "message"), - take_from=kwargs) + output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs) assert output == 4 assert ( - str(warning.warning) == - f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. 
message" ) def test_deprecate_function_args(self): @@ -56,49 +49,48 @@ def test_deprecate_function_args(self): output_1, output_2 = deprecate( ("deprecated_arg_1", self.higher_version, "Hey"), ("deprecated_arg_2", self.higher_version, "Hey"), - take_from=kwargs, ) + take_from=kwargs, + ) assert output_1 == 4 assert output_2 == 8 assert ( - str(warning.warnings[0].message) == - f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[0].message) + == f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey" ) assert ( - str(warning.warnings[1].message) == - f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[1].message) + == f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey" ) def test_deprecate_function_incorrect_arg(self): kwargs = {"deprecated_arg": 4} with self.assertRaises(TypeError) as error: - deprecate( - ("wrong_arg", self.higher_version, "message"), take_from=kwargs) - assert "test_deprecate_function_incorrect_arg in" in str( - error.exception) + deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs) + assert "test_deprecate_function_incorrect_arg in" in str(error.exception) assert "line" in str(error.exception) - assert "got an unexpected keyword argument `deprecated_arg`" in str( - error.exception) + assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception) def test_deprecate_arg_no_kwarg(self): with self.assertWarns(FutureWarning) as warning: deprecate(("deprecated_arg", self.higher_version, "message")) assert ( - str(warning.warning) == - f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message" ) def test_deprecate_args_no_kwarg(self): with self.assertWarns(FutureWarning) as warning: deprecate( ("deprecated_arg_1", self.higher_version, "Hey"), - ("deprecated_arg_2", self.higher_version, "Hey"), ) + ("deprecated_arg_2", self.higher_version, "Hey"), + ) assert ( - str(warning.warnings[0].message) == - f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[0].message) + == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey" ) assert ( - str(warning.warnings[1].message) == - f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[1].message) + == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey" ) def test_deprecate_class_obj(self): @@ -106,12 +98,11 @@ class Args: arg = 5 with self.assertWarns(FutureWarning) as warning: - arg = deprecate( - ("arg", self.higher_version, "message"), take_from=Args()) + arg = deprecate(("arg", self.higher_version, "message"), take_from=Args()) assert arg == 5 assert ( - str(warning.warning) == - f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. 
message" ) def test_deprecate_class_objs(self): @@ -124,45 +115,45 @@ class Args: ("arg", self.higher_version, "message"), ("foo", self.higher_version, "message"), ("does not exist", self.higher_version, "message"), - take_from=Args(), ) + take_from=Args(), + ) assert arg_1 == 5 assert arg_2 == 7 assert ( - str(warning.warning) == - f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" ) assert ( - str(warning.warnings[0].message) == - f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warnings[0].message) + == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" ) assert ( - str(warning.warnings[1].message) == - f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warnings[1].message) + == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message" ) def test_deprecate_incorrect_version(self): kwargs = {"deprecated_arg": 4} with self.assertRaises(ValueError) as error: - deprecate( - ("wrong_arg", self.lower_version, "message"), take_from=kwargs) + deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs) assert ( - str(error.exception) == - f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}" + str(error.exception) + == f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}" ) def test_deprecate_incorrect_no_standard_warn(self): with self.assertWarns(FutureWarning) as warning: deprecate( - ("deprecated_arg", self.higher_version, - "This message is better!!!"), - standard_warn=False, ) + ("deprecated_arg", self.higher_version, "This message is better!!!"), + standard_warn=False, + ) assert str(warning.warning) == "This message is better!!!" def test_deprecate_stacklevel(self): with self.assertWarns(FutureWarning) as warning: deprecate( - ("deprecated_arg", self.higher_version, - "This message is better!!!"), - standard_warn=False, ) + ("deprecated_arg", self.higher_version, "This message is better!!!"), + standard_warn=False, + ) assert str(warning.warning) == "This message is better!!!" 
assert "test_utils.py" in warning.filename diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py index f0804e24b9b35..e49767c5a033b 100644 --- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -18,14 +18,20 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - XLMRobertaTokenizer) +from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer import ppdiffusers # noqa F401 -from ppdiffusers import (AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AltDiffusionPipeline, + AutoencoderKL, + DDIMScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, RobertaSeriesModelWithTransformation) + RobertaSeriesConfig, + RobertaSeriesModelWithTransformation, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -48,13 +54,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -62,7 +70,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,11 +83,12 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=5002, ) + vocab_size=5002, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta", - model_max_length=77) # must set model_max_length 77 here + "hf-internal-testing/tiny-xlm-roberta", model_max_length=77 + ) # must set model_max_length 77 here components = { "unet": unet, "scheduler": scheduler, @@ -111,9 +121,9 @@ def test_alt_diffusion_ddim(self): layer_norm_eps=1e-05, num_attention_heads=4, num_hidden_layers=5, - vocab_size=5002, ) - text_encoder = RobertaSeriesModelWithTransformation( - text_encoder_config).eval() + vocab_size=5002, + ) + text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval() components["text_encoder"] = text_encoder alt_pipe = AltDiffusionPipeline(**components) alt_pipe.set_progress_bar_config(disable=None) @@ -123,17 +133,19 @@ def test_alt_diffusion_ddim(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.32336113, - 0.2371237, - 0.34009337, - 0.22972241, - 0.23742735, - 0.4925817, - 0.22020563, - 0.20505491, - 0.43374813, - ]) + expected_slice = np.array( + [ + 0.32336113, + 0.2371237, + 0.34009337, + 0.22972241, + 0.23742735, + 0.4925817, + 0.22020563, + 0.20505491, + 0.43374813, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_alt_diffusion_pndm(self): @@ -147,9 +159,9 @@ def 
test_alt_diffusion_pndm(self): layer_norm_eps=1e-05, num_attention_heads=4, num_hidden_layers=5, - vocab_size=5002, ) - text_encoder = RobertaSeriesModelWithTransformation( - text_encoder_config).eval() + vocab_size=5002, + ) + text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval() components["text_encoder"] = text_encoder alt_pipe = AltDiffusionPipeline(**components) alt_pipe.set_progress_bar_config(disable=None) @@ -158,17 +170,19 @@ def test_alt_diffusion_pndm(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.24095133, - 0.26875997, - 0.34291863, - 0.2529385, - 0.2736602, - 0.49928105, - 0.23973131, - 0.21133915, - 0.41810605, - ]) + expected_slice = np.array( + [ + 0.24095133, + 0.26875997, + 0.34291863, + 0.2529385, + 0.2736602, + 0.49928105, + 0.23973131, + 0.21133915, + 0.41810605, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 @@ -181,8 +195,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_alt_diffusion(self): - alt_pipe = AltDiffusionPipeline.from_pretrained( - "BAAI/AltDiffusion", safety_checker=None) + alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -191,48 +204,47 @@ def test_alt_diffusion(self): generator=generator, guidance_scale=6.0, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.8718514442443848, - 0.8715569972991943, - 0.8748429417610168, - 0.8708409070968628, - 0.8782679438591003, - 0.8931069374084473, - 0.883078932762146, - 0.881088376045227, - 0.8617547154426575, - ]) + expected_slice = np.array( + [ + 0.8718514442443848, + 0.8715569972991943, + 0.8748429417610168, + 0.8708409070968628, + 0.8782679438591003, + 0.8931069374084473, + 0.883078932762146, + 0.881088376045227, + 0.8617547154426575, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_alt_diffusion_fast_ddim(self): - scheduler = DDIMScheduler.from_pretrained( - "BAAI/AltDiffusion", subfolder="scheduler") - alt_pipe = AltDiffusionPipeline.from_pretrained( - "BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") + alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) - output = alt_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="numpy") + output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy") image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.9265012741088867, - 0.9305188059806824, - 0.8999797105789185, - 0.9346827268600464, - 0.9264709949493408, - 0.9447494745254517, - 0.9428927898406982, - 0.9417785406112671, - 0.9157286882400513, - ]) + expected_slice = np.array( + [ + 0.9265012741088867, + 0.9305188059806824, + 0.8999797105789185, + 0.9346827268600464, + 0.9264709949493408, + 0.9447494745254517, + 0.9428927898406982, + 0.9417785406112671, + 
0.9157286882400513, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index ca070f3ff45ee..1422ec516f01d 100644 --- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -22,11 +22,17 @@ from paddlenlp.transformers import XLMRobertaTokenizer import ppdiffusers # noqa F401 -from ppdiffusers import (AltDiffusionImg2ImgPipeline, AutoencoderKL, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AltDiffusionImg2ImgPipeline, + AutoencoderKL, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.image_processor import VaeImageProcessor from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, RobertaSeriesModelWithTransformation) + RobertaSeriesConfig, + RobertaSeriesModelWithTransformation, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -42,8 +48,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -57,7 +62,8 @@ def dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -69,7 +75,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -83,7 +90,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=5006, ) + vocab_size=5006, + ) return RobertaSeriesModelWithTransformation(config) @property @@ -106,8 +114,7 @@ def test_stable_diffusion_img2img_default_case(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta") + tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") tokenizer.model_max_length = 77 init_image = self.dummy_image alt_pipe = AltDiffusionImg2ImgPipeline( @@ -117,9 +124,9 @@ def test_stable_diffusion_img2img_default_case(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) - alt_pipe.image_processor = VaeImageProcessor( - vae_scale_factor=alt_pipe.vae_scale_factor) + feature_extractor=self.dummy_extractor, + ) + alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -129,7 +136,8 @@ def test_stable_diffusion_img2img_default_case(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - image=init_image, ) + image=init_image, + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = alt_pipe( @@ -139,24 +147,26 @@ def test_stable_diffusion_img2img_default_case(self): num_inference_steps=2, 
output_type="np", image=init_image, - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.48931587, - 0.40102208, - 0.49653798, - 0.4203022, - 0.34621224, - 0.50789315, - 0.41116416, - 0.4933398, - 0.5465742, - ]) + expected_slice = np.array( + [ + 0.48931587, + 0.40102208, + 0.49653798, + 0.4203022, + 0.34621224, + 0.50789315, + 0.41116416, + 0.4933398, + 0.5465742, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.005 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.005 def test_stable_diffusion_img2img_fp16(self): """Test that stable diffusion img2img works with fp16""" @@ -164,8 +174,7 @@ def test_stable_diffusion_img2img_fp16(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta") + tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") tokenizer.model_max_length = 77 init_image = self.dummy_image unet = unet.to(dtype=paddle.float16) @@ -178,9 +187,9 @@ def test_stable_diffusion_img2img_fp16(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) - alt_pipe.image_processor = VaeImageProcessor( - vae_scale_factor=alt_pipe.vae_scale_factor) + feature_extractor=self.dummy_extractor, + ) + alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -189,7 +198,8 @@ def test_stable_diffusion_img2img_fp16(self): generator=generator, num_inference_steps=2, output_type="np", - image=init_image, ).images + image=init_image, + ).images assert image.shape == (1, 32, 32, 3) def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): @@ -198,8 +208,7 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): ) init_image = init_image.resize((760, 504)) model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "A fantasy landscape, trending on artstation" @@ -210,21 +219,24 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): strength=0.75, guidance_scale=7.5, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] image_slice = image[255:258, 383:386, -1] assert image.shape == (504, 760, 3) - expected_slice = np.array([ - 0.3251649, - 0.3340174, - 0.3418343, - 0.32628638, - 0.33462793, - 0.3300547, - 0.31628466, - 0.3470268, - 0.34273332, - ]) + expected_slice = np.array( + [ + 0.3251649, + 0.3340174, + 0.3418343, + 0.32628638, + 0.33462793, + 0.3300547, + 0.31628466, + 0.3470268, + 0.34273332, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 @@ -245,8 +257,7 @@ def test_stable_diffusion_img2img_pipeline_default(self): # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy" # ) model_id = "BAAI/AltDiffusion" - pipe = 
AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "A fantasy landscape, trending on artstation" @@ -257,19 +268,22 @@ def test_stable_diffusion_img2img_pipeline_default(self): strength=0.75, guidance_scale=7.5, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images assert image.shape == (1, 512, 768, 3) image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.09987255930900574, - 0.09875822067260742, - 0.12803134322166443, - 0.10067081451416016, - 0.1142435073852539, - 0.11815103888511658, - 0.14216548204421997, - 0.16465380787849426, - 0.15393462777137756, - ]) + expected_slice = np.array( + [ + 0.09987255930900574, + 0.09875822067260742, + 0.12803134322166443, + 0.10067081451416016, + 0.1142435073852539, + 0.11815103888511658, + 0.14216548204421997, + 0.16465380787849426, + 0.15393462777137756, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py index a8426c0ee78a1..e65d01ffc9eb8 100644 --- a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py @@ -19,9 +19,16 @@ import numpy as np import paddle -from ppdiffusers import (AudioDiffusionPipeline, AutoencoderKL, DDIMScheduler, - DDPMScheduler, DiffusionPipeline, Mel, - UNet2DConditionModel, UNet2DModel) +from ppdiffusers import ( + AudioDiffusionPipeline, + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + DiffusionPipeline, + Mel, + UNet2DConditionModel, + UNet2DModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -42,7 +49,8 @@ def dummy_unet(self): layers_per_block=2, block_out_channels=(128, 128), down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), ) + up_block_types=("UpBlock2D", "AttnUpBlock2D"), + ) return model @property @@ -56,7 +64,8 @@ def dummy_unet_condition(self): block_out_channels=(128, 128), down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - cross_attention_dim=10, ) + cross_attention_dim=10, + ) return model @property @@ -70,7 +79,8 @@ def dummy_vqvae_and_unet(self): layers_per_block=2, block_out_channels=(128, 128), down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), ) + up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), + ) unet = UNet2DModel( sample_size=(64, 32), in_channels=1, @@ -78,14 +88,14 @@ def dummy_vqvae_and_unet(self): layers_per_block=2, block_out_channels=(128, 128), down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), ) + up_block_types=("UpBlock2D", "AttnUpBlock2D"), + ) return vqvae, unet def test_audio_diffusion(self): mel = Mel() scheduler = DDPMScheduler() - pipe = AudioDiffusionPipeline( - vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) + pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(42) output = pipe(generator=generator, steps=4) @@ -96,55 +106,55 @@ def 
test_audio_diffusion(self): image_from_tuple = output[0][0] assert audio.shape == ( 1, - (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length, ) - assert (image.height == self.dummy_unet.config.sample_size[0] and - image.width == self.dummy_unet.config.sample_size[1]) + (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length, + ) + assert ( + image.height == self.dummy_unet.config.sample_size[0] + and image.width == self.dummy_unet.config.sample_size[1] + ) image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - image_from_tuple_slice = np.frombuffer( - image_from_tuple.tobytes(), dtype="uint8")[:10] + image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] expected_slice = np.array([0, 252, 0, 160, 144, 1, 0, 211, 99, 3]) assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) <= 5 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() <= 5 scheduler = DDIMScheduler() dummy_vqvae_and_unet = self.dummy_vqvae_and_unet pipe = AudioDiffusionPipeline( vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, - scheduler=scheduler, ) + scheduler=scheduler, + ) pipe.set_progress_bar_config(disable=None) np.random.seed(0) raw_audio = np.random.uniform( -1, 1, - ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * - mel.hop_length, ), ) + ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * mel.hop_length,), + ) generator = paddle.Generator().manual_seed(42) - output = pipe( - raw_audio=raw_audio, generator=generator, start_step=5, steps=10) + output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) image = output.images[0] assert ( image.height == self.dummy_vqvae_and_unet[0].config.sample_size[0] - and - image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1]) + and image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1] + ) image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array( - [128, 100, 153, 95, 92, 77, 130, 121, 81, 166]) + expected_slice = np.array([128, 100, 153, 95, 92, 77, 130, 121, 81, 166]) assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 dummy_unet_condition = self.dummy_unet_condition pipe = AudioDiffusionPipeline( vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, - scheduler=scheduler, ) + scheduler=scheduler, + ) np.random.seed(0) encoding = paddle.rand(shape=(1, 1, 10)) output = pipe(generator=generator, encoding=encoding) image = output.images[0] image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array( - [139, 103, 88, 105, 100, 120, 116, 99, 106, 89]) + expected_slice = np.array([139, 103, 88, 105, 100, 120, 116, 99, 106, 89]) assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 @@ -157,8 +167,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_audio_diffusion(self): - pipe = DiffusionPipeline.from_pretrained( - "teticio/audio-diffusion-ddim-256") + pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(42) output = pipe(generator=generator) @@ -166,10 +175,9 @@ def test_audio_diffusion(self): image = output.images[0] assert audio.shape == ( 1, - (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length, ) - assert (image.height == pipe.unet.config.sample_size[0] and - image.width == 
pipe.unet.config.sample_size[1]) + (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length, + ) + assert image.height == pipe.unet.config.sample_size[0] and image.width == pipe.unet.config.sample_size[1] image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array( - [151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) + expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 diff --git a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py index 82c9242a44d2d..c9d67aaf82a83 100644 --- a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py +++ b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py @@ -18,13 +18,22 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (ClapTextConfig, ClapTextModelWithProjection, - RobertaTokenizer, SpeechT5HifiGan, - SpeechT5HifiGanConfig) - -from ppdiffusers import (AudioLDMPipeline, AutoencoderKL, DDIMScheduler, - LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) +from paddlenlp.transformers import ( + ClapTextConfig, + ClapTextModelWithProjection, + RobertaTokenizer, + SpeechT5HifiGan, + SpeechT5HifiGanConfig, +) + +from ppdiffusers import ( + AudioLDMPipeline, + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.training_utils import enable_full_determinism from ppdiffusers.utils import require_paddle_gpu, slow @@ -39,16 +48,18 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): params = TEXT_TO_AUDIO_PARAMS batch_params = TEXT_TO_AUDIO_BATCH_PARAMS test_xformers_attention = False - required_optional_params = frozenset([ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "num_waveforms_per_prompt", + "generator", + "latents", + "output_type", + "return_dict", + "callback", + "callback_steps", + ] + ) def get_dummy_components(self): paddle.seed(0) @@ -63,13 +74,15 @@ def get_dummy_components(self): cross_attention_dim=(32, 64), class_embed_type="simple_projection", projection_class_embeddings_input_dim=32, - class_embeddings_concat=True, ) + class_embeddings_concat=True, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -77,7 +90,8 @@ def get_dummy_components(self): out_channels=1, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = ClapTextConfig( bos_token_id=0, @@ -89,11 +103,11 @@ def get_dummy_components(self): num_hidden_layers=5, pad_token_id=1, vocab_size=1000, - projection_dim=32, ) + projection_dim=32, + ) text_encoder = ClapTextModelWithProjection(text_encoder_config) text_encoder.eval() - tokenizer = RobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-roberta", model_max_length=77) + tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) vocoder_config = SpeechT5HifiGanConfig( model_in_dim=8, @@ -103,7 +117,8 @@ def 
get_dummy_components(self): upsample_kernel_sizes=[4, 4], resblock_kernel_sizes=[3, 7], resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, ) + normalize_before=False, + ) vocoder = SpeechT5HifiGan(vocoder_config) vocoder.eval() @@ -139,18 +154,20 @@ def test_audioldm_ddim(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([ - -0.0050, - 0.0050, - -0.0060, - 0.0033, - -0.0026, - 0.0033, - -0.0027, - 0.0033, - -0.0028, - 0.0033, - ]) + expected_slice = np.array( + [ + -0.0050, + 0.0050, + -0.0060, + 0.0033, + -0.0026, + 0.0033, + -0.0027, + 0.0033, + -0.0028, + 0.0033, + ] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-2 @@ -175,10 +192,13 @@ def test_audioldm_prompt_embeds(self): max_length=audioldm_pipe.tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"].cast("int32") - prompt_embeds = audioldm_pipe.text_encoder(text_inputs, ) + prompt_embeds = audioldm_pipe.text_encoder( + text_inputs, + ) prompt_embeds = prompt_embeds.text_embeds # additional L_2 normalization over each hidden-state prompt_embeds = F.normalize(prompt_embeds, axis=-1) @@ -216,10 +236,13 @@ def test_audioldm_negative_prompt_embeds(self): max_length=audioldm_pipe.tokenizer.model_max_length, truncation=True, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"].cast("int32") - text_embeds = audioldm_pipe.text_encoder(text_inputs, ) + text_embeds = audioldm_pipe.text_encoder( + text_inputs, + ) text_embeds = text_embeds.text_embeds # additional L_2 normalization over each hidden-state text_embeds = F.normalize(text_embeds, axis=-1) @@ -249,18 +272,20 @@ def test_audioldm_negative_prompt(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([ - -0.0051, - 0.0050, - -0.0060, - 0.0034, - -0.0026, - 0.0033, - -0.0027, - 0.0033, - -0.0028, - 0.0032, - ]) + expected_slice = np.array( + [ + -0.0051, + 0.0050, + -0.0060, + 0.0034, + -0.0026, + 0.0033, + -0.0027, + 0.0033, + -0.0028, + 0.0032, + ] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-2 @@ -278,8 +303,7 @@ def test_audioldm_num_waveforms_per_prompt(self): # test num_waveforms_per_prompt=1 (default) for batch of prompts batch_size = 2 - audios = audioldm_pipe( - [prompt] * batch_size, num_inference_steps=2).audios + audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios assert audios.shape == (batch_size, 256) @@ -288,7 +312,8 @@ def test_audioldm_num_waveforms_per_prompt(self): audios = audioldm_pipe( prompt, num_inference_steps=2, - num_waveforms_per_prompt=num_waveforms_per_prompt, ).audios + num_waveforms_per_prompt=num_waveforms_per_prompt, + ).audios assert audios.shape == (num_waveforms_per_prompt, 256) @@ -297,7 +322,8 @@ def test_audioldm_num_waveforms_per_prompt(self): audios = audioldm_pipe( [prompt] * batch_size, num_inference_steps=2, - num_waveforms_per_prompt=num_waveforms_per_prompt, ).audios + num_waveforms_per_prompt=num_waveforms_per_prompt, + ).audios assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) @@ -339,12 +365,10 @@ def test_audioldm_vocoder_model_in_dim(self): assert audio_shape == (1, 256) def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass( - test_mean_pixel_difference=False) + self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) def 
test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - test_mean_pixel_difference=False) + self._test_inference_batch_single_identical(test_mean_pixel_difference=False) @slow @@ -380,25 +404,26 @@ def test_audioldm(self): assert len(audio) == 81920 audio_slice = audio[77230:77240] - expected_slice = np.array([ - -0.4884, - -0.4607, - 0.0023, - 0.5007, - 0.5896, - 0.5151, - 0.3813, - -0.0208, - -0.3687, - -0.4315, - ]) + expected_slice = np.array( + [ + -0.4884, + -0.4607, + 0.0023, + 0.5007, + 0.5896, + 0.5151, + 0.3813, + -0.0208, + -0.3687, + -0.4315, + ] + ) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-2 def test_audioldm_lms(self): audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config( - audioldm_pipe.scheduler.config) + audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -408,17 +433,19 @@ def test_audioldm_lms(self): assert len(audio) == 81920 audio_slice = audio[27780:27790] - expected_slice = np.array([ - -0.2131, - -0.0873, - -0.0124, - -0.0189, - 0.0569, - 0.1373, - 0.1883, - 0.2886, - 0.3297, - 0.2212, - ]) + expected_slice = np.array( + [ + -0.2131, + -0.0873, + -0.0124, + -0.0189, + 0.0569, + 0.1373, + 0.1883, + 0.2886, + 0.3297, + 0.2212, + ] + ) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 3e-2 diff --git a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py index 9b76eed8898ad..b8477a5e775df 100644 --- a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py @@ -23,8 +23,10 @@ from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, - UNCONDITIONAL_AUDIO_GENERATION_PARAMS) +from ..pipeline_params import ( + UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, + UNCONDITIONAL_AUDIO_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -55,9 +57,9 @@ def get_dummy_components(self): use_timestep_embedding=False, time_embedding_type="fourier", mid_block_type="UNetMidBlock1D", - down_block_types=("DownBlock1DNoSkip", "DownBlock1D", - "AttnDownBlock1D"), - up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), ) + down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), + up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), + ) scheduler = IPNDMScheduler() components = {"unet": unet, "scheduler": scheduler} return components @@ -65,11 +67,7 @@ def get_dummy_components(self): def get_dummy_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 4 - } + inputs = {"batch_size": 1, "generator": generator, "num_inference_steps": 4} return inputs def test_dance_diffusion(self): @@ -81,8 +79,7 @@ def test_dance_diffusion(self): audio = output.audios audio_slice = audio[0, -3:, -3:] assert audio.shape == (1, 2, components["unet"].sample_size) - expected_slice = np.array( - [1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0]) + expected_slice = np.array([1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0]) assert np.abs(audio_slice.flatten() - expected_slice).max() 
< 0.01 @@ -98,42 +95,39 @@ def test_dance_diffusion(self): pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - output = pipe( - generator=generator, - num_inference_steps=100, - audio_length_in_s=4.096) + output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) audio = output.audios audio_slice = audio[0, -3:, -3:] assert audio.shape == (1, 2, pipe.unet.sample_size) - expected_slice = np.array([ - -0.15758808, - -0.15257765, - -0.12701476, - -0.26994032, - -0.27616554, - -0.24865153, - ]) + expected_slice = np.array( + [ + -0.15758808, + -0.15257765, + -0.12701476, + -0.26994032, + -0.27616554, + -0.24865153, + ] + ) assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.01 def test_dance_diffusion_fp16(self): - pipe = DanceDiffusionPipeline.from_pretrained( - "harmonai/maestro-150k", paddle_dtype=paddle.float16) + pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - output = pipe( - generator=generator, - num_inference_steps=100, - audio_length_in_s=4.096) + output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) audio = output.audios audio_slice = audio[0, -3:, -3:] assert audio.shape == (1, 2, pipe.unet.sample_size) # scheduler use fp32 - expected_slice = np.array([ - -0.15350387, - -0.14624646, - -0.12091318, - -0.25969276, - -0.26154587, - -0.23359495, - ]) + expected_slice = np.array( + [ + -0.15350387, + -0.14624646, + -0.12091318, + -0.25969276, + -0.26154587, + -0.23359495, + ] + ) assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/ddim/test_ddim.py b/ppdiffusers/tests/pipelines/ddim/test_ddim.py index c2fb14bc1020a..92f66001a03f4 100644 --- a/ppdiffusers/tests/pipelines/ddim/test_ddim.py +++ b/ppdiffusers/tests/pipelines/ddim/test_ddim.py @@ -21,8 +21,10 @@ from ppdiffusers import DDIMPipeline, DDIMScheduler, UNet2DModel from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow -from ..pipeline_params import (UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, - UNCONDITIONAL_IMAGE_GENERATION_PARAMS) +from ..pipeline_params import ( + UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, + UNCONDITIONAL_IMAGE_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -47,7 +49,8 @@ def get_dummy_components(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) scheduler = DDIMScheduler() components = {"unet": unet, "scheduler": scheduler} return components @@ -71,17 +74,19 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 32, 32, 3)) - expected_slice = np.array([ - 0.0, - 0.00152004, - 0.0, - 0.0, - 0.00860906, - 0.00182715, - 0.00189051, - 1.0, - 0.668702, - ]) + expected_slice = np.array( + [ + 0.0, + 0.00152004, + 0.0, + 0.0, + 0.00860906, + 0.00182715, + 0.00189051, + 1.0, + 0.668702, + ] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) @@ -99,10 +104,7 @@ def test_inference_cifar10(self): image = ddim(generator=generator, eta=0.0, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 
32, 32, 3) - expected_slice = np.array([ - 0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, - 0.2388 - ]) + expected_slice = np.array([0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, 0.2388]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_inference_ema_bedroom(self): @@ -115,15 +117,17 @@ def test_inference_ema_bedroom(self): image = ddim(generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.19830778, - 0.18826014, - 0.18584034, - 0.1927332, - 0.18754855, - 0.17855307, - 0.18288234, - 0.16375086, - 0.1497818, - ]) + expected_slice = np.array( + [ + 0.19830778, + 0.18826014, + 0.18584034, + 0.1927332, + 0.18754855, + 0.17855307, + 0.18288234, + 0.16375086, + 0.1497818, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py index 988129c546625..f2d25b2e39403 100644 --- a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py +++ b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def test_fast_inference(self): @@ -42,33 +43,33 @@ def test_fast_inference(self): ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) ddpm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=2, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = ddpm( generator=generator, num_inference_steps=2, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.0, - 0.0, - 0.0, - 0.0, - 0.007474243640899658, - 0.0, - 0.007990598678588867, - 0.9972629547119141, - 0.6665917634963989, - ]) + expected_slice = np.array( + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.007474243640899658, + 0.0, + 0.007990598678588867, + 0.9972629547119141, + 0.6665917634963989, + ] + ) print(image_slice.flatten().tolist()) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_inference_predict_sample(self): unet = self.dummy_uncond_unet @@ -76,18 +77,14 @@ def test_inference_predict_sample(self): ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) ddpm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=2, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - image_eps = ddpm( - generator=generator, num_inference_steps=2, output_type="numpy")[0] + image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0] image_slice = image[0, -3:, -3:, -1] image_eps_slice = image_eps[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) tolerance = 0.01 - 
assert (np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() - < tolerance) + assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance @slow @@ -103,8 +100,5 @@ def test_inference_cifar10(self): image = ddpm(generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, - 0.2020 - ]) + expected_slice = np.array([0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, 0.2020]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py index 9f3a881a35c78..acb9a8a602116 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py @@ -30,13 +30,11 @@ class IFPipelineTesterMixin: def _get_dummy_components(self): paddle.seed(0) - text_encoder = T5EncoderModel.from_pretrained( - "hf-internal-testing/tiny-random-t5") + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") text_encoder.eval() paddle.seed(0) - tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-t5") + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") paddle.seed(0) unet = UNet2DConditionModel( @@ -48,9 +46,7 @@ def _get_dummy_components(self): "SimpleCrossAttnDownBlock2D", ], mid_block_type="UNetMidBlock2DSimpleCrossAttn", - up_block_types=[ - "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D" - ], + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], in_channels=3, out_channels=6, cross_attention_dim=32, @@ -60,9 +56,9 @@ def _get_dummy_components(self): addition_embed_type_num_heads=2, cross_attention_norm="group_norm", resnet_time_scale_shift="scale_shift", - act_fn="gelu", ) - unet.set_attn_processor( - AttnAddedKVProcessor()) # For reproducibility tests + act_fn="gelu", + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests paddle.seed(0) scheduler = DDPMScheduler( @@ -74,7 +70,8 @@ def _get_dummy_components(self): dynamic_thresholding_ratio=0.95, sample_max_value=1.0, prediction_type="epsilon", - variance_type="learned_range", ) + variance_type="learned_range", + ) paddle.seed(0) watermarker = IFWatermarker() @@ -91,13 +88,11 @@ def _get_dummy_components(self): def _get_superresolution_dummy_components(self): paddle.seed(0) - text_encoder = T5EncoderModel.from_pretrained( - "hf-internal-testing/tiny-random-t5") + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") text_encoder.eval() paddle.seed(0) - tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-t5") + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") paddle.seed(0) unet = UNet2DConditionModel( @@ -109,9 +104,7 @@ def _get_superresolution_dummy_components(self): "SimpleCrossAttnDownBlock2D", ], mid_block_type="UNetMidBlock2DSimpleCrossAttn", - up_block_types=[ - "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D" - ], + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], in_channels=6, out_channels=6, cross_attention_dim=32, @@ -125,9 +118,9 @@ def _get_superresolution_dummy_components(self): class_embed_type="timestep", mid_block_scale_factor=1.414, time_embedding_act_fn="gelu", - time_embedding_dim=32, ) - unet.set_attn_processor( - 
AttnAddedKVProcessor()) # For reproducibility tests + time_embedding_dim=32, + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests paddle.seed(0) scheduler = DDPMScheduler( @@ -139,14 +132,16 @@ def _get_superresolution_dummy_components(self): dynamic_thresholding_ratio=0.95, sample_max_value=1.0, prediction_type="epsilon", - variance_type="learned_range", ) + variance_type="learned_range", + ) paddle.seed(0) image_noising_scheduler = DDPMScheduler( num_train_timesteps=1000, beta_schedule="squaredcos_cap_v2", beta_start=0.0001, - beta_end=0.02, ) + beta_end=0.02, + ) paddle.seed(0) watermarker = IFWatermarker() @@ -226,8 +221,7 @@ def _test_save_load_optional_components(self): pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) pipe_loaded.set_progress_bar_config(disable=None) - pipe_loaded.unet.set_attn_processor( - AttnAddedKVProcessor()) # For reproducibility tests + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests for optional_component in pipe._optional_components: self.assertTrue( @@ -278,8 +272,7 @@ def _test_save_load_local(self): pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) pipe_loaded.set_progress_bar_config(disable=None) - pipe_loaded.unet.set_attn_processor( - AttnAddedKVProcessor()) # For reproducibility tests + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests inputs = self.get_dummy_inputs() output_loaded = pipe_loaded(**inputs)[0] diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py index f5daacd7abdcb..4192ea593d45d 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py @@ -19,26 +19,31 @@ import paddle from ppdiffusers import ( - IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline) + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, +) from ppdiffusers.models.attention_processor import AttnAddedKVProcessor -from ppdiffusers.utils.testing_utils import (floats_tensor, load_numpy, - require_paddle_gpu, slow) +from ppdiffusers.utils.testing_utils import ( + floats_tensor, + load_numpy, + require_paddle_gpu, + slow, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import (PipelineTesterMixin, - assert_mean_pixel_difference) +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . 
import IFPipelineTesterMixin -class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, - unittest.TestCase): +class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFPipeline params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"} batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_dummy_components() @@ -69,11 +74,12 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) @slow @@ -88,24 +94,21 @@ def tearDown(self): def test_all(self): # if - pipe_1 = IFPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", - variant="fp16", - paddle_dtype=paddle.float16) + pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) pipe_2 = IFSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", variant="fp16", paddle_dtype=paddle.float16, text_encoder=None, - tokenizer=None, ) + tokenizer=None, + ) # pre compute text embeddings and remove T5 to save memory pipe_1.text_encoder - prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt( - "anime turtle") + prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle") del pipe_1.tokenizer del pipe_1.text_encoder @@ -136,8 +139,7 @@ def test_all(self): pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds) + self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) pipe_1.remove_all_hooks() pipe_2.remove_all_hooks() @@ -153,8 +155,7 @@ def test_all(self): pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds) + self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): # pipeline 1 @@ -165,7 +166,8 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): negative_prompt_embeds=negative_prompt_embeds, num_inference_steps=2, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -191,7 +193,8 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): image=image, generator=generator, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -205,8 +208,7 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): ) assert_mean_pixel_difference(image, expected_image) - def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds): + def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): # pipeline 1 image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) @@ -219,7 +221,8 @@ def _test_if_img2img(self, pipe_1, 
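The slow IF test above is also the clearest statement of the memory-saving pattern it exercises: encode the prompt once with the stage-1 pipeline's T5 encoder, delete the encoder and tokenizer, and hand the precomputed embeddings to every stage. A sketch built from the calls visible in test_all and _test_if; anything beyond what the hunk shows (for example the exact seed) is an assumption.

import paddle

from ppdiffusers import IFPipeline, IFSuperResolutionPipeline

pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
pipe_2 = IFSuperResolutionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0",
    variant="fp16",
    paddle_dtype=paddle.float16,
    text_encoder=None,  # stage 2 reuses the stage-1 embeddings, so it never loads its own T5
    tokenizer=None,
)

# Pre-compute text embeddings and remove T5 to save memory.
prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle")
del pipe_1.tokenizer
del pipe_1.text_encoder

# Stage 1 consumes the embeddings directly instead of a prompt string.
# Two inference steps only to keep the test fast; real use needs many more.
generator = paddle.Generator().manual_seed(0)
image = pipe_1(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    num_inference_steps=2,
    generator=generator,
    output_type="np",
).images[0]
# The same embeddings are then passed to pipe_2 together with `image` for super-resolution.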
pipe_2, prompt_embeds, image=image, num_inference_steps=2, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -247,7 +250,8 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, original_image=original_image, generator=generator, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -261,8 +265,7 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, ) assert_mean_pixel_difference(image, expected_image) - def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds): + def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): # pipeline 1 image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) @@ -276,7 +279,8 @@ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, mask_image=mask_image, num_inference_steps=2, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -306,7 +310,8 @@ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, original_image=original_image, generator=generator, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images[0] diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py index 3fce4eab7164b..bab44fc4a5cbf 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -20,20 +20,19 @@ from ppdiffusers import IFImg2ImgPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, - unittest.TestCase): +class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_dummy_components() @@ -58,8 +57,7 @@ def test_save_load_optional_components(self): self._test_save_load_optional_components() def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_float16(self): # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder @@ -75,4 +73,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index effd8aec47da6..0d977c5d6f2ee 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -20,21 +20,19 @@ from ppdiffusers import IFImg2ImgSuperResolutionPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFImg2ImgSuperResolutionPipelineFastTests( - PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): +class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFImg2ImgSuperResolutionPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union( - {"original_image"}) - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_superresolution_dummy_components() @@ -58,8 +56,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -75,4 +72,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 803ebffdb1ad5..e46b7c5ebea69 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -20,20 +20,19 @@ from ppdiffusers import IFInpaintingPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, - unittest.TestCase): +class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFInpaintingPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_dummy_components() @@ -57,8 +56,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -74,4 +72,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index 0f24c066122e2..d50852284146e 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -20,21 +20,19 @@ from ppdiffusers import IFInpaintingSuperResolutionPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFInpaintingSuperResolutionPipelineFastTests( - PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): +class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFInpaintingSuperResolutionPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union( - {"original_image"}) - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_superresolution_dummy_components() @@ -60,8 +58,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -77,4 +74,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py index ae1810b58f991..79a7319b80757 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -20,20 +20,19 @@ from ppdiffusers import IFSuperResolutionPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFSuperResolutionPipelineFastTests( - PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): +class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFSuperResolutionPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_superresolution_dummy_components() @@ -55,8 +54,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -72,4 +70,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/dit/test_dit.py b/ppdiffusers/tests/pipelines/dit/test_dit.py index ffbe5d6d4dc33..c9d17607fcbd0 100644 --- a/ppdiffusers/tests/pipelines/dit/test_dit.py +++ b/ppdiffusers/tests/pipelines/dit/test_dit.py @@ -19,13 +19,20 @@ import numpy as np import paddle -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiTPipeline, - DPMSolverMultistepScheduler, Transformer2DModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DiTPipeline, + DPMSolverMultistepScheduler, + Transformer2DModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, - CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS) +from ..pipeline_params import ( + CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, + CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -55,7 +62,8 @@ def get_dummy_components(self): activation_fn="gelu-approximate", num_embeds_ada_norm=1000, norm_type="ada_norm_zero", - norm_elementwise_affine=False, ) + norm_elementwise_affine=False, + ) vae = AutoencoderKL() scheduler = DDIMScheduler() components = { @@ -85,20 +93,15 @@ def test_inference(self): image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 16, 16, 3)) print(image_slice.flatten()) - expected_slice = np.array([ - 0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0, - 0.14398015 - ]) + expected_slice = np.array([0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0, 0.14398015]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - relax_max_difference=True, expected_max_diff=1e-3) + self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) @require_paddle_gpu @@ -116,35 +119,35 @@ def 
test_dit_256(self): words = ["vase", "umbrella", "white shark", "white wolf"] ids = pipe.get_label_ids(words) - images = pipe( - ids, generator=generator, num_inference_steps=40, - output_type="np").images - expected_slices = np.array([ - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0], + images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images + expected_slices = np.array( [ - 0.434637188911438, - 0.4323567748069763, - 0.4406988322734833, - 0.442973256111145, - 0.4462621212005615, - 0.45129328966140747, - 0.41893237829208374, - 0.42390328645706177, - 0.3906112015247345, - ], - [ - 0.9986965656280518, - 0.9948190450668335, - 0.9841029644012451, - 0.9911775588989258, - 0.9871039390563965, - 0.9874314069747925, - 0.9822297096252441, - 0.9997426271438599, - 1.0, - ], - ]) + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0], + [ + 0.434637188911438, + 0.4323567748069763, + 0.4406988322734833, + 0.442973256111145, + 0.4462621212005615, + 0.45129328966140747, + 0.41893237829208374, + 0.42390328645706177, + 0.3906112015247345, + ], + [ + 0.9986965656280518, + 0.9948190450668335, + 0.9841029644012451, + 0.9911775588989258, + 0.9871039390563965, + 0.9874314069747925, + 0.9822297096252441, + 0.9997426271438599, + 1.0, + ], + ] + ) for word, image, expected_slice in zip(words, images, expected_slices): # expected_image = load_numpy( @@ -152,37 +155,34 @@ def test_dit_256(self): # ) assert image.shape == (256, 256, 3) image_slice = image[-3:, -3:, -1] - assert np.abs((image_slice.flatten() - expected_slice).max( - )) < 0.001 + assert np.abs((image_slice.flatten() - expected_slice).max()) < 0.001 def test_dit_512_fp16(self): - pipe = DiTPipeline.from_pretrained( - "facebook/DiT-XL-2-512", paddle_dtype=paddle.float16) - pipe.scheduler = DPMSolverMultistepScheduler.from_config( - pipe.scheduler.config) + pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", paddle_dtype=paddle.float16) + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.to("gpu") words = ["vase", "umbrella"] ids = pipe.get_label_ids(words) generator = paddle.Generator().manual_seed(0) - images = pipe( - ids, generator=generator, num_inference_steps=25, - output_type="np").images + images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images - expected_slices = np.array([ - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625], + expected_slices = np.array( [ - 0.0, - 0.0, - 0.01708984375, - 0.024658203125, - 0.0830078125, - 0.134521484375, - 0.175537109375, - 0.33740234375, - 0.207763671875, - ], - ]) + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625], + [ + 0.0, + 0.0, + 0.01708984375, + 0.024658203125, + 0.0830078125, + 0.134521484375, + 0.175537109375, + 0.33740234375, + 0.207763671875, + ], + ] + ) for word, image, expected_slice in zip(words, images, expected_slices): # expected_image = load_numpy( diff --git a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py index aff5775323867..da80059ddfdc4 100644 --- a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py +++ b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", 
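test_dit_512_fp16 above doubles as a usage recipe for the class-conditional DiT pipeline: human-readable labels are mapped to ImageNet class ids, the default scheduler is swapped for multistep DPM-Solver via from_config, and inference runs in fp16 on GPU. A condensed sketch of exactly those calls:

import paddle

from ppdiffusers import DiTPipeline, DPMSolverMultistepScheduler

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", paddle_dtype=paddle.float16)
# The test pairs the 512 checkpoint with multistep DPM-Solver and only 25 sampling steps.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("gpu")

# DiT is class-conditional: words become ImageNet class ids before the call.
ids = pipe.get_label_ids(["vase", "umbrella"])

generator = paddle.Generator().manual_seed(0)
images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images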
"UpBlock2D"), + ) return model def test_inference(self): @@ -42,22 +43,20 @@ def test_inference(self): pipe = KarrasVePipeline(unet=unet, scheduler=scheduler) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = pipe( - num_inference_steps=2, generator=generator, - output_type="numpy").images + image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = pipe( num_inference_steps=2, generator=generator, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -70,20 +69,20 @@ def test_inference(self): pipe = KarrasVePipeline(unet=model, scheduler=scheduler) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = pipe( - num_inference_steps=20, generator=generator, - output_type="numpy").images + image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.7528239, - 0.7529462, - 0.76014197, - 0.75482357, - 0.75692874, - 0.7577723, - 0.760527, - 0.758951, - 0.7599246, - ]) + expected_slice = np.array( + [ + 0.7528239, + 0.7529462, + 0.76014197, + 0.75482357, + 0.75692874, + 0.7577723, + 0.760527, + 0.758951, + 0.7599246, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py index 93583e8814480..3bdb01281a103 100644 --- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py @@ -20,10 +20,18 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, - UNet2DConditionModel) -from ppdiffusers.utils.testing_utils import (load_numpy, nightly, - require_paddle_gpu, slow) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LDMTextToImagePipeline, + UNet2DConditionModel, +) +from ppdiffusers.utils.testing_utils import ( + load_numpy, + nightly, + require_paddle_gpu, + slow, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -55,13 +63,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=(32, 64), @@ -69,7 +79,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), up_block_types=("UpDecoderBlock2D", 
"UpDecoderBlock2D"), - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -80,10 +91,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -113,17 +124,19 @@ def test_inference_text2img(self): image = pipe(**inputs).images assert image.shape == (1, 64, 64, 3) image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.28524342, - 0.23806289, - 0.38151595, - 0.21939021, - 0.26112252, - 0.5172909, - 0.25647423, - 0.25049314, - 0.47979864, - ]) + expected_slice = np.array( + [ + 0.28524342, + 0.23806289, + 0.38151595, + 0.21939021, + 0.26112252, + 0.5172909, + 0.25647423, + 0.25049314, + 0.47979864, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -150,24 +163,25 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained( - "CompVis/ldm-text2im-large-256") + pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.51825, - 0.5285, - 0.52543, - 0.54258, - 0.52304, - 0.52569, - 0.54363, - 0.55276, - 0.56878, - ]) + expected_slice = np.array( + [ + 0.51825, + 0.5285, + 0.52543, + 0.54258, + 0.52304, + 0.52569, + 0.54363, + 0.55276, + 0.56878, + ] + ) max_diff = np.abs(expected_slice - image_slice).max() assert max_diff < 0.02 @@ -195,8 +209,7 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained( - "CompVis/ldm-text2im-large-256") + pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images[0] diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index 32472986acf44..aea2e7538f903 100644 --- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -19,10 +19,8 @@ import numpy as np import paddle -from ppdiffusers import (DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, - VQModel) -from ppdiffusers.utils import (PIL_INTERPOLATION, floats_tensor, load_image, - slow) +from ppdiffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel +from ppdiffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle @@ -32,8 +30,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -46,7 +43,8 @@ def dummy_uncond_unet(self): in_channels=6, 
out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model @property @@ -58,15 +56,15 @@ def dummy_vq_model(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, ) + latent_channels=3, + ) return model def test_inference_superresolution(self): unet = self.dummy_uncond_unet scheduler = DDIMScheduler() vqvae = self.dummy_vq_model - ldm = LDMSuperResolutionPipeline( - unet=unet, vqvae=vqvae, scheduler=scheduler) + ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) ldm.set_progress_bar_config(disable=None) init_image = self.dummy_image generator = paddle.Generator().manual_seed(0) @@ -74,20 +72,23 @@ def test_inference_superresolution(self): image=init_image, generator=generator, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.12982202, - 0.8338444, - 0.46506804, - 0.5459576, - 0.6662215, - 0.38444045, - 0.72195464, - 0.5719301, - 0.36579454, - ]) + expected_slice = np.array( + [ + 0.12982202, + 0.8338444, + 0.46506804, + 0.5459576, + 0.6662215, + 0.38444045, + 0.72195464, + 0.5719301, + 0.36579454, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_inference_superresolution_fp16(self): @@ -96,12 +97,10 @@ def test_inference_superresolution_fp16(self): vqvae = self.dummy_vq_model unet = unet.to(dtype=paddle.float16) vqvae = vqvae.to(dtype=paddle.float16) - ldm = LDMSuperResolutionPipeline( - unet=unet, vqvae=vqvae, scheduler=scheduler) + ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) ldm.set_progress_bar_config(disable=None) init_image = self.dummy_image - image = ldm(init_image, num_inference_steps=2, - output_type="numpy").images + image = ldm(init_image, num_inference_steps=2, output_type="numpy").images assert image.shape == (1, 64, 64, 3) @@ -112,21 +111,17 @@ def test_inference_superresolution(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool.png" ) - init_image = init_image.resize( - (64, 64), resample=PIL_INTERPOLATION["lanczos"]) - ldm = LDMSuperResolutionPipeline.from_pretrained( - "duongna/ldm-super-resolution") + init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"]) + ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution") ldm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) image = ldm( image=init_image, generator=generator, num_inference_steps=20, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257, - 0.6907 - ]) + expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257, 0.6907]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py index 5ad34d0481b67..89319ee92bcb2 100644 --- 
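The slow super-resolution test above spells out the inference path end to end: fetch a low-resolution image, shrink it to 64x64 with Lanczos resampling, and run LDMSuperResolutionPipeline for 20 steps, after which the test expects a 256x256 result. A sketch using only the calls shown in that test:

import paddle

from ppdiffusers import LDMSuperResolutionPipeline
from ppdiffusers.utils import PIL_INTERPOLATION, load_image

init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool.png"
)
# In the test a 64x64 input comes back as 256x256, i.e. a 4x upscale.
init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])

ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution")
ldm.set_progress_bar_config(disable=None)

generator = paddle.Generator().manual_seed(0)
image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
assert image.shape == (1, 256, 256, 3)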
a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py @@ -34,7 +34,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model @property @@ -46,7 +47,8 @@ def dummy_vq_model(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, ) + latent_channels=3, + ) return model @property @@ -61,7 +63,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() def test_inference_uncond(self): @@ -71,33 +74,33 @@ def test_inference_uncond(self): ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler) ldm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ldm(generator=generator, - num_inference_steps=2, - output_type="numpy").images + image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = ldm( generator=generator, num_inference_steps=2, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.827049, - 1.0, - 0.6244688, - 0.7729403, - 1.0, - 0.73071766, - 0.6108738, - 0.9107263, - 0.7249622, - ]) + expected_slice = np.array( + [ + 0.827049, + 1.0, + 0.6244688, + 0.7729403, + 1.0, + 0.73071766, + 0.6108738, + 0.9107263, + 0.7249622, + ] + ) tolerance = 0.01 assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance - assert (np.abs(image_from_tuple_slice.flatten() - expected_slice).max() - < tolerance) + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance @slow @@ -107,21 +110,21 @@ def test_inference_uncond(self): ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") ldm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ldm(generator=generator, - num_inference_steps=5, - output_type="numpy").images + image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.59802866, - 0.61698544, - 0.62753576, - 0.6128236, - 0.60961217, - 0.617262, - 0.6060791, - 0.60261935, - 0.6129079, - ]) + expected_slice = np.array( + [ + 0.59802866, + 0.61698544, + 0.62753576, + 0.6128236, + 0.60961217, + 0.617262, + 0.6060791, + 0.60261935, + 0.6129079, + ] + ) tolerance = 0.01 assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance diff --git a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py index 35c1718941567..00025bde5002d 100644 --- a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -22,14 +22,20 @@ from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig from PIL import Image -from ppdiffusers import (AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, 
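The unconditional LDM test that follows reduces to a short usage pattern: load the CelebA-HQ checkpoint, seed a generator, and sample with no prompt at all. A sketch of the slow test's calls; the test keeps num_inference_steps at 5 purely to stay fast, real sampling would normally use far more steps.

import paddle

from ppdiffusers import LDMPipeline

ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
ldm.set_progress_bar_config(disable=None)

# Unconditional generation: no prompt, just a seeded generator and a step count.
generator = paddle.Generator().manual_seed(0)
image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
assert image.shape == (1, 256, 256, 3)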
- UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + PaintByExamplePipeline, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.pipelines.paint_by_example import PaintByExampleImageEncoder from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -48,7 +54,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -57,7 +64,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) config = CLIPVisionConfig( hidden_size=32, @@ -67,7 +75,8 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, image_size=32, - patch_size=4, ) + patch_size=4, + ) image_encoder = PaintByExampleImageEncoder(config, proj_size=32) feature_extractor = CLIPImageProcessor(crop_size=32, size=32) components = { @@ -93,13 +102,9 @@ def test_save_load_float16(self): def get_dummy_inputs(self, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (64, 64))) - example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (32, 32)) + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) + example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) generator = paddle.Generator().manual_seed(seed) inputs = { @@ -122,17 +127,19 @@ def test_paint_by_example_inpaint(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.82595694, - 0.51862055, - 0.5474039, - 0.2411496, - 0.20220888, - 0.3430622, - 0.3558151, - 0.06606945, - 0.4550809, - ]) + expected_slice = np.array( + [ + 0.82595694, + 0.51862055, + 0.5474039, + 0.2411496, + 0.20220888, + 0.3430622, + 0.3558151, + 0.06606945, + 0.4550809, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_paint_by_example_image_tensor(self): @@ -172,8 +179,7 @@ def test_paint_by_example(self): example_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/panda.jpg" ) - pipe = PaintByExamplePipeline.from_pretrained( - "Fantasy-Studio/Paint-by-Example") + pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(seed=321) output = pipe( @@ -183,12 +189,10 @@ def test_paint_by_example(self): generator=generator, guidance_scale=5.0, num_inference_steps=50, - output_type="np", ) + output_type="np", + ) image = 
output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529, - 0.5374 - ]) + expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529, 0.5374]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 diff --git a/ppdiffusers/tests/pipelines/pipeline_params.py b/ppdiffusers/tests/pipelines/pipeline_params.py index 33b041a173248..9f835e6e783cc 100644 --- a/ppdiffusers/tests/pipelines/pipeline_params.py +++ b/ppdiffusers/tests/pipelines/pipeline_params.py @@ -22,80 +22,89 @@ # I.e. a text to image pipeline with non-configurable height and width arguments # should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. -TEXT_TO_IMAGE_PARAMS = frozenset([ - "prompt", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", -]) +TEXT_TO_IMAGE_PARAMS = frozenset( + [ + "prompt", + "height", + "width", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + "cross_attention_kwargs", + ] +) TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) -IMAGE_VARIATION_PARAMS = frozenset([ - "image", - "height", - "width", - "guidance_scale", -]) +IMAGE_VARIATION_PARAMS = frozenset( + [ + "image", + "height", + "width", + "guidance_scale", + ] +) IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"]) -TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset([ - "prompt", - "image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", -]) - -TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset( - ["prompt", "image", "negative_prompt"]) - -TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset([ - # Text guided image variation with an image mask - "prompt", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", -]) - -TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset( - ["prompt", "image", "mask_image", "negative_prompt"]) - -IMAGE_INPAINTING_PARAMS = frozenset([ - # image variation with an image mask - "image", - "mask_image", - "height", - "width", - "guidance_scale", -]) +TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset( + [ + "prompt", + "image", + "height", + "width", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + ] +) + +TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"]) + +TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( + [ + # Text guided image variation with an image mask + "prompt", + "image", + "mask_image", + "height", + "width", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + ] +) + +TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"]) + +IMAGE_INPAINTING_PARAMS = frozenset( + [ + # image variation with an image mask + "image", + "mask_image", + "height", + "width", + "guidance_scale", + ] +) IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"]) -IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset([ - "example_image", - "image", - "mask_image", - "height", - "width", - "guidance_scale", -]) +IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( + [ + "example_image", + "image", + "mask_image", + "height", + "width", + "guidance_scale", + ] +) 
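The slow Paint-by-Example test above is effectively a recipe: a source image, a mask over the region to replace, and an example image that conditions what gets painted in, driven by guidance_scale=5.0 over 50 steps. A sketch mirroring those calls; only the example-image URL is visible in the hunk, so the init and mask paths below are hypothetical placeholders.

import paddle

from ppdiffusers import PaintByExamplePipeline
from ppdiffusers.utils import load_image

init_image = load_image("dog_in_bucket.png")  # hypothetical local file: the image to edit
mask_image = load_image("bucket_mask.png")    # hypothetical local file: mask over the region to repaint
example_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/panda.jpg"
)

pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
pipe.set_progress_bar_config(disable=None)

generator = paddle.Generator().manual_seed(seed=321)
image = pipe(
    image=init_image,
    mask_image=mask_image,
    example_image=example_image,
    generator=generator,
    guidance_scale=5.0,
    num_inference_steps=50,
    output_type="np",
).images[0]  # a single 512x512x3 array, per the shape assertion in the test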
-IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset( - ["example_image", "image", "mask_image"]) +IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"]) CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"]) @@ -109,15 +118,17 @@ UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([]) -TEXT_TO_AUDIO_PARAMS = frozenset([ - "prompt", - "audio_length_in_s", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", -]) +TEXT_TO_AUDIO_PARAMS = frozenset( + [ + "prompt", + "audio_length_in_s", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + "cross_attention_kwargs", + ] +) TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"]) diff --git a/ppdiffusers/tests/pipelines/pndm/test_pndm.py b/ppdiffusers/tests/pipelines/pndm/test_pndm.py index 2255f43742f71..bfa6285a45d5f 100644 --- a/ppdiffusers/tests/pipelines/pndm/test_pndm.py +++ b/ppdiffusers/tests/pipelines/pndm/test_pndm.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def test_inference(self): @@ -42,22 +43,20 @@ def test_inference(self): pndm = PNDMPipeline(unet=unet, scheduler=scheduler) pndm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = pndm( - generator=generator, num_inference_steps=20, - output_type="numpy").images + image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = pndm( generator=generator, num_inference_steps=20, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -73,15 +72,17 @@ def test_inference_cifar10(self): image = pndm(generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.15949559211730957, - 0.17172572016716003, - 0.17315810918807983, - 0.1836635172367096, - 0.1823960244655609, - 0.1799020767211914, - 0.21776044368743896, - 0.22992581129074097, - 0.21678516268730164, - ]) + expected_slice = np.array( + [ + 0.15949559211730957, + 0.17172572016716003, + 0.17315810918807983, + 0.1836635172367096, + 0.1823960244655609, + 0.1799020767211914, + 0.21776044368743896, + 0.22992581129074097, + 0.21678516268730164, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/repaint/test_repaint.py b/ppdiffusers/tests/pipelines/repaint/test_repaint.py index 3bce3769af1be..9d27e3b1c5061 100644 --- a/ppdiffusers/tests/pipelines/repaint/test_repaint.py +++ b/ppdiffusers/tests/pipelines/repaint/test_repaint.py @@ -20,11 +20,14 @@ import paddle from ppdiffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from 
ppdiffusers.utils.testing_utils import (load_image, load_numpy, nightly, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import ( + load_image, + load_numpy, + nightly, + require_paddle_gpu, +) -from ..pipeline_params import (IMAGE_INPAINTING_BATCH_PARAMS, - IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -49,7 +52,8 @@ def get_dummy_components(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) scheduler = RePaintScheduler() components = {"unet": unet, "scheduler": scheduler} return components @@ -80,17 +84,19 @@ def test_repaint(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.08341709, - 0.54262626, - 0.549711, - 0.00903523, - 0.0, - 1.0, - 0.05136755, - 0.5604646, - 0.6273578, - ]) + expected_slice = np.array( + [ + 0.08341709, + 0.54262626, + 0.549711, + 0.00903523, + 0.0, + 1.0, + 0.05136755, + 0.5604646, + 0.6273578, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 # RePaint can hardly be made deterministic since the scheduler is currently always @@ -133,7 +139,8 @@ def test_celebahq(self): jump_length=10, jump_n_sample=10, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (256, 256, 3) assert np.abs(expected_image - image).mean() < 0.01 diff --git a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py index f3b799000aa41..97af9d23e974c 100644 --- a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def test_inference(self): @@ -42,22 +43,20 @@ def test_inference(self): sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler) sde_ve.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = sde_ve( - num_inference_steps=2, output_type="numpy", - generator=generator).images + image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images generator = paddle.Generator().manual_seed(0) image_from_tuple = sde_ve( num_inference_steps=2, output_type="numpy", generator=generator, - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -70,9 +69,7 @@ def test_inference(self): sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler) sde_ve.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = sde_ve( - num_inference_steps=10, output_type="numpy", - generator=generator).images + image = 
sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) expected_slice = np.array([1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]) diff --git a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py index 6188cab488e6a..cf7e0a7ba17a7 100644 --- a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py @@ -22,10 +22,16 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) -from ppdiffusers.pipelines.semantic_stable_diffusion import \ - SemanticStableDiffusionPipeline as StableDiffusionPipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.semantic_stable_diffusion import ( + SemanticStableDiffusionPipeline as StableDiffusionPipeline, +) from ppdiffusers.utils import floats_tensor, nightly from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -41,8 +47,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -56,7 +61,8 @@ def dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -68,7 +74,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -83,7 +90,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -108,11 +116,11 @@ def test_semantic_diffusion_ddim(self): beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -120,7 +128,8 @@ def test_semantic_diffusion_ddim(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -129,7 +138,8 @@ def test_semantic_diffusion_ddim(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -138,29 +148,31 @@ def test_semantic_diffusion_ddim(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - 
return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28401083, - 0.23724163, - 0.38141036, - 0.2201719, - 0.26111937, - 0.5176592, - 0.25668317, - 0.25036532, - 0.47986418, - ]) + expected_slice = np.array( + [ + 0.28401083, + 0.23724163, + 0.38141036, + 0.2201719, + 0.26111937, + 0.5176592, + 0.25668317, + 0.25036532, + 0.47986418, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_semantic_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) assert isinstance(pipe, StableDiffusionPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -168,8 +180,7 @@ def test_semantic_diffusion_no_safety_checker(self): assert image is not None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) assert pipe.safety_checker is None image = pipe("example prompt", num_inference_steps=2).images[0] assert image is not None @@ -179,8 +190,7 @@ def test_semantic_diffusion_pndm(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -188,7 +198,8 @@ def test_semantic_diffusion_pndm(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -197,7 +208,8 @@ def test_semantic_diffusion_pndm(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -206,24 +218,26 @@ def test_semantic_diffusion_pndm(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.18612236, - 0.24176982, - 0.36099488, - 0.21807766, - 0.27262795, - 0.51991826, - 0.22258872, - 0.22143877, - 0.4452843, - ]) + expected_slice = np.array( + [ + 0.18612236, + 0.24176982, + 0.36099488, + 0.21807766, + 0.27262795, + 0.51991826, + 0.22258872, + 0.22143877, + 0.4452843, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.02 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.02 def test_semantic_diffusion_fp16(self): """Test that 
stable diffusion works with fp16""" @@ -231,8 +245,7 @@ def test_semantic_diffusion_fp16(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet = unet.to(dtype=paddle.float16) vae = vae.to(dtype=paddle.float16) bert = bert.to(dtype=paddle.float16) @@ -243,11 +256,11 @@ def test_semantic_diffusion_fp16(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" - image = sd_pipe( - [prompt], num_inference_steps=2, output_type="np").images + image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images assert image.shape == (1, 64, 64, 3) @@ -260,8 +273,7 @@ def tearDown(self): # paddle.device.cuda.empty_cache() def test_positive_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "a photo of a cat" edit = { @@ -283,7 +295,8 @@ def test_positive_guidance(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -308,7 +321,8 @@ def test_positive_guidance(self): output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -326,8 +340,7 @@ def test_positive_guidance(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_negative_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "an image of a crowded boulevard, realistic, 4k" edit = { @@ -349,7 +362,8 @@ def test_negative_guidance(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -374,7 +388,8 @@ def test_negative_guidance(self): output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -392,13 +407,11 @@ def test_negative_guidance(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_multi_cond_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "a castle next to a river" edit = { - "editing_prompt": - ["boat on a river, boat", "monet, impression, sunrise"], + "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"], "reverse_editing_direction": False, "edit_warmup_steps": [15, 18], "edit_guidance_scale": 6, @@ -416,7 +429,8 @@ def test_multi_cond_guidance(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -441,7 +455,8 @@ def test_multi_cond_guidance(self): 
output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -459,8 +474,7 @@ def test_multi_cond_guidance(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 def test_guidance_fp16(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) prompt = "a photo of a cat" edit = { @@ -482,7 +496,8 @@ def test_guidance_fp16(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -507,7 +522,8 @@ def test_guidance_fp16(self): output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ diff --git a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 9355b00dcdff0..465b997e0c007 100644 --- a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -18,15 +18,19 @@ import numpy as np import paddle -from ppdiffusers import (DDPMScheduler, MidiProcessor, - SpectrogramDiffusionPipeline) +from ppdiffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline from ppdiffusers.pipelines.spectrogram_diffusion import ( - SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder) + SpectrogramContEncoder, + SpectrogramNotesEncoder, + T5FilmDecoder, +) from ppdiffusers.training_utils import enable_full_determinism from ppdiffusers.utils import require_paddle_gpu, slow -from ..pipeline_params import (TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, - TOKENS_TO_AUDIO_GENERATION_PARAMS) +from ..pipeline_params import ( + TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, + TOKENS_TO_AUDIO_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin enable_full_determinism(42) @@ -38,8 +42,7 @@ # is not compatible with python 3.8 which we run in the CI. 
# https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:98 # @unittest.skip("The note-seq package currently throws an error on import") -class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = SpectrogramDiffusionPipeline required_optional_params = PipelineTesterMixin.required_optional_params - { "callback", @@ -65,7 +68,8 @@ def get_dummy_components(self): num_heads=1, d_kv=4, d_ff=2048, - feed_forward_proj="gated-gelu", ) + feed_forward_proj="gated-gelu", + ) notes_encoder.eval() paddle.seed(0) continuous_encoder = SpectrogramContEncoder( @@ -77,7 +81,8 @@ def get_dummy_components(self): num_heads=1, d_kv=4, d_ff=2048, - feed_forward_proj="gated-gelu", ) + feed_forward_proj="gated-gelu", + ) continuous_encoder.eval() paddle.seed(0) @@ -90,7 +95,8 @@ def get_dummy_components(self): num_heads=1, d_kv=4, d_ff=2048, - dropout_rate=0.1, ) + dropout_rate=0.1, + ) decoder.eval() scheduler = DDPMScheduler() @@ -108,23 +114,26 @@ def get_dummy_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed) inputs = { - "input_tokens": [[ - 1134, - 90, - 1135, - 1133, - 1080, - 112, - 1132, - 1080, - 1133, - 1079, - 133, - 1132, - 1079, - 1133, - 1, - ] + [0] * 2033], + "input_tokens": [ + [ + 1134, + 90, + 1135, + 1133, + 1080, + 112, + 1132, + 1080, + 1133, + 1079, + 133, + 1132, + 1079, + 1133, + 1, + ] + + [0] * 2033 + ], "generator": generator, "num_inference_steps": 4, "output_type": "mel", @@ -144,17 +153,19 @@ def test_spectrogram_diffusion(self): mel_slice = mel[0, -3:, -3:] assert mel_slice.shape == (3, 3) - expected_slice = np.array([ - -11.46511, - 4.0, - -8.506372, - -11.512925, - -11.512925, - -10.417862, - -8.077912, - 3.7985802, - 4.0, - ]) + expected_slice = np.array( + [ + -11.46511, + 4.0, + -8.506372, + -11.512925, + -11.512925, + -10.417862, + -8.077912, + 3.7985802, + 4.0, + ] + ) assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2 def test_save_load_local(self): @@ -191,8 +202,7 @@ def tearDown(self): def test_callback(self): # TODO - test that pipeline can decode tokens in a callback # so that music can be played live - pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") melgan = pipe.melgan pipe.melgan = None @@ -215,12 +225,12 @@ def callback(step, mel_output): num_inference_steps=5, generator=generator, callback=callback, - output_type="mel", ) + output_type="mel", + ) def test_spectrogram_fast(self): - pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe.set_progress_bar_config(disable=None) processor = MidiProcessor() @@ -237,8 +247,7 @@ def test_spectrogram_fast(self): def test_spectrogram(self): - pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe.set_progress_bar_config(disable=None) processor = MidiProcessor() @@ -249,8 +258,7 @@ def test_spectrogram(self): input_tokens = input_tokens[:4] generator = paddle.Generator().manual_seed(0) - output = pipe( - input_tokens, num_inference_steps=100, generator=generator) + output = pipe(input_tokens, num_inference_steps=100, 
generator=generator) audio = output.audios[0] assert abs(np.abs(audio).sum() - 14418.089) < 5e-2 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index f5beae09ac46f..50c27ff574be4 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -21,13 +21,19 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + CycleDiffusionPipeline, + DDIMScheduler, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -39,11 +45,8 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "width", "negative_prompt_embeds", } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union( - {"source_prompt"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) def get_dummy_components(self): paddle.seed(0) @@ -55,14 +58,16 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -70,7 +75,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -81,10 +87,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -123,17 +129,19 @@ def test_stable_diffusion_cycle(self): images = output.images image_slice = images[0, -3:, -3:, -1] assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.04812625, - 0.77983606, - 0.71009433, - 0.15924984, - 0.9788434, - 0.49732354, - 0.362224, - 0.6481595, - 0.4530744, - ]) + expected_slice = np.array( + [ + 0.04812625, + 0.77983606, + 0.71009433, + 0.15924984, + 0.9788434, + 0.49732354, + 0.362224, + 0.6481595, + 0.4530744, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_cycle_fp16(self): @@ 
-148,17 +156,19 @@ def test_stable_diffusion_cycle_fp16(self): images = output.images image_slice = images[0, -3:, -3:, -1] assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.05053711, - 0.78125, - 0.7114258, - 0.15991211, - 0.9785156, - 0.49804688, - 0.36279297, - 0.6484375, - 0.45361328, - ]) + expected_slice = np.array( + [ + 0.05053711, + 0.78125, + 0.7114258, + 0.15991211, + 0.9785156, + 0.49804688, + 0.36279297, + 0.6484375, + 0.45361328, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 @unittest.skip("non-deterministic pipeline") @@ -178,18 +188,17 @@ def test_cycle_diffusion_pipeline_fp16(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png" ) - expected_image = np.array([[0.14477539, 0.20483398, 0.14135742], - [0.10009766, 0.17602539, 0.11083984]]) + expected_image = np.array([[0.14477539, 0.20483398, 0.14135742], [0.10009766, 0.17602539, 0.11083984]]) init_image = init_image.resize((512, 512)) model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained( - model_id, subfolder="scheduler") + scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") pipe = CycleDiffusionPipeline.from_pretrained( model_id, scheduler=scheduler, safety_checker=None, paddle_dtype=paddle.float16, - revision="fp16", ) + revision="fp16", + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() source_prompt = "A black colored car" @@ -205,7 +214,8 @@ def test_cycle_diffusion_pipeline_fp16(self): guidance_scale=3, source_guidance_scale=1, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images assert np.abs(image[0][0][:2] - expected_image).max() < 0.5 @@ -213,14 +223,11 @@ def test_cycle_diffusion_pipeline(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png" ) - expected_image = np.array([[0.16294342, 0.20514232, 0.14554858], - [0.11476257, 0.16831946, 0.11495486]]) + expected_image = np.array([[0.16294342, 0.20514232, 0.14554858], [0.11476257, 0.16831946, 0.11495486]]) init_image = init_image.resize((512, 512)) model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained( - model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained( - model_id, scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") + pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() source_prompt = "A black colored car" @@ -236,6 +243,7 @@ def test_cycle_diffusion_pipeline(self): guidance_scale=3, source_guidance_scale=1, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images assert np.abs(image[0][0][:2] - expected_image).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 184bd9f7b4927..042ad47fa00eb 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -22,10 +22,17 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, 
DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, - UNet2DConditionModel, logging) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, + logging, +) from ppdiffusers.utils import load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu @@ -49,13 +56,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -63,7 +72,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,10 +84,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -110,17 +120,19 @@ def test_stable_diffusion_ddim(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28519553, - 0.23807192, - 0.38150552, - 0.21930423, - 0.26092762, - 0.51721215, - 0.25639117, - 0.25039536, - 0.47978917, - ]) + expected_slice = np.array( + [ + 0.28519553, + 0.23807192, + 0.38150552, + 0.21930423, + 0.26092762, + 0.51721215, + 0.25639117, + 0.25039536, + 0.47978917, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_lora(self): @@ -159,14 +171,14 @@ def test_stable_diffusion_prompt_embeds(self): padding="max_length", max_length=sd_pipe.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"] prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] inputs["prompt_embeds"] = prompt_embeds output = sd_pipe(**inputs) image_slice_2 = output.images[0, -3:, -3:, -1] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max( - ) < 0.0001 + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001 def test_stable_diffusion_negative_prompt_embeds(self): components = self.get_dummy_components() @@ -187,14 +199,14 @@ def test_stable_diffusion_negative_prompt_embeds(self): padding="max_length", max_length=sd_pipe.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"] embeds.append(sd_pipe.text_encoder(text_inputs)[0]) inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds output = sd_pipe(**inputs) image_slice_2 = output.images[0, -3:, -3:, -1] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max( - ) < 
0.0001 + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001 def test_stable_diffusion_ddim_factor_8(self): components = self.get_dummy_components() @@ -205,17 +217,19 @@ def test_stable_diffusion_ddim_factor_8(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 136, 136, 3) - expected_slice = np.array([ - 0.39545745, - 0.94682777, - 0.6828775, - 0.42496994, - 0.49475053, - 0.48353004, - 0.27300328, - 0.30724254, - 0.50566095, - ]) + expected_slice = np.array( + [ + 0.39545745, + 0.94682777, + 0.6828775, + 0.42496994, + 0.49475053, + 0.48353004, + 0.27300328, + 0.30724254, + 0.50566095, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 @@ -229,23 +243,25 @@ def test_stable_diffusion_pndm(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.18620703, - 0.24143961, - 0.3609084, - 0.21810293, - 0.27230006, - 0.51992655, - 0.22248471, - 0.2213102, - 0.44538254, - ]) + expected_slice = np.array( + [ + 0.18620703, + 0.24143961, + 0.3609084, + 0.21810293, + 0.27230006, + 0.51992655, + 0.22248471, + 0.2213102, + 0.44538254, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) assert isinstance(pipe, StableDiffusionPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -253,8 +269,7 @@ def test_stable_diffusion_no_safety_checker(self): assert image is not None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) assert pipe.safety_checker is None image = pipe("example prompt", num_inference_steps=2).images[0] assert image is not None @@ -262,80 +277,82 @@ def test_stable_diffusion_no_safety_checker(self): def test_stable_diffusion_k_lms(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output = sd_pipe(**inputs) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.29910105, - 0.22905633, - 0.37701294, - 0.21332851, - 0.26000416, - 0.52840894, - 0.25865072, - 0.25947532, - 0.47509664, - ]) + expected_slice = np.array( + [ + 0.29910105, + 0.22905633, + 0.37701294, + 0.21332851, + 0.26000416, + 0.52840894, + 0.25865072, + 0.25947532, + 0.47509664, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler_ancestral(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output = sd_pipe(**inputs) 
image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.29917336, - 0.22854236, - 0.37669897, - 0.2137424, - 0.25940597, - 0.528258, - 0.25919583, - 0.2594489, - 0.47522712, - ]) + expected_slice = np.array( + [ + 0.29917336, + 0.22854236, + 0.37669897, + 0.2137424, + 0.25940597, + 0.528258, + 0.25919583, + 0.2594489, + 0.47522712, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output = sd_pipe(**inputs) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.29910135, - 0.22905621, - 0.3770129, - 0.21332836, - 0.26000386, - 0.52840906, - 0.2586509, - 0.2594754, - 0.47509673, - ]) + expected_slice = np.array( + [ + 0.29910135, + 0.22905621, + 0.3770129, + 0.21332836, + 0.26000386, + 0.52840906, + 0.2586509, + 0.2594754, + 0.47509673, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_vae_slicing(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) image_count = 4 @@ -346,9 +363,7 @@ def test_stable_diffusion_vae_slicing(self): inputs = self.get_dummy_inputs() inputs["prompt"] = [inputs["prompt"]] * image_count output_2 = sd_pipe(**inputs) - assert ( - np.abs(output_2.images.flatten() - output_1.images.flatten()).max() - < 0.003) + assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 0.003 def test_stable_diffusion_vae_tiling(self): components = self.get_dummy_components() @@ -367,7 +382,8 @@ def test_stable_diffusion_vae_tiling(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) # make sure tiled vae decode yields the same result sd_pipe.enable_vae_tiling() @@ -377,11 +393,10 @@ def test_stable_diffusion_vae_tiling(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) - assert ( - np.abs(output_2.images.flatten() - output_1.images.flatten()).max() - < 5e-1) + assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1 def test_stable_diffusion_negative_prompt(self): components = self.get_dummy_components() @@ -394,17 +409,19 @@ def test_stable_diffusion_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.16709289, - 0.26912582, - 0.35834038, - 0.23045751, - 0.30960953, - 0.5324909, - 0.20372942, - 0.2368694, - 0.43633103, - ]) + expected_slice = np.array( + [ + 0.16709289, + 0.26912582, + 0.35834038, + 0.23045751, + 0.30960953, + 0.5324909, + 0.20372942, + 0.2368694, + 0.43633103, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_num_images_per_prompt(self): @@ -416,59 +433,59 @@ def 
test_stable_diffusion_num_images_per_prompt(self): images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images assert images.shape == (1, 64, 64, 3) batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, num_inference_steps=2, - output_type="np").images + images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images assert images.shape == (batch_size, 64, 64, 3) num_images_per_prompt = 2 images = sd_pipe( prompt, num_inference_steps=2, output_type="np", - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (num_images_per_prompt, 64, 64, 3) batch_size = 2 images = sd_pipe( [prompt] * batch_size, num_inference_steps=2, output_type="np", - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) def test_stable_diffusion_long_prompt(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) do_classifier_free_guidance = True negative_prompt = None num_images_per_prompt = 1 - logger = logging.get_logger( - "ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") prompt = 25 * "@" with CaptureLogger(logger) as cap_logger_3: text_embeddings_3 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) prompt = 100 * "@" with CaptureLogger(logger) as cap_logger: text_embeddings = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) negative_prompt = "Hello" with CaptureLogger(logger) as cap_logger_2: text_embeddings_2 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) - assert (text_embeddings_3.shape == text_embeddings_2.shape == - text_embeddings.shape) + negative_prompt, + ) + assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape assert text_embeddings.shape[1] == 77 assert cap_logger.out == cap_logger_2.out assert cap_logger.out.count("@") == 25 @@ -476,20 +493,14 @@ def test_stable_diffusion_long_prompt(self): def test_stable_diffusion_height_width_opt(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) prompt = "hey" output = sd_pipe(prompt, num_inference_steps=1, output_type="np") image_shape = output.images[0].shape[:2] assert image_shape == (64, 64) - output = sd_pipe( - prompt, - num_inference_steps=1, - height=96, - width=96, - output_type="np") + output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") image_shape = output.images[0].shape[:2] assert image_shape == (96, 96) config = dict(sd_pipe.unet.config) @@ -523,113 +534,116 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_1_1_pndm(self): - 
sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-1") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.43625, - 0.43554, - 0.3667, - 0.4066, - 0.39703, - 0.38658, - 0.43936, - 0.43557, - 0.40592, - ]) + expected_slice = np.array( + [ + 0.43625, + 0.43554, + 0.3667, + 0.4066, + 0.39703, + 0.38658, + 0.43936, + 0.43557, + 0.40592, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.574, - 0.47841, - 0.31625, - 0.63583, - 0.58306, - 0.55056, - 0.50825, - 0.56306, - 0.55748, - ]) + expected_slice = np.array( + [ + 0.574, + 0.47841, + 0.31625, + 0.63583, + 0.58306, + 0.55056, + 0.50825, + 0.56306, + 0.55748, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.38019, - 0.28647, - 0.27321, - 0.40377, - 0.3829, - 0.35446, - 0.39218, - 0.38165, - 0.42239, - ]) + expected_slice = np.array( + [ + 0.38019, + 0.28647, + 0.27321, + 0.40377, + 0.3829, + 0.35446, + 0.39218, + 0.38165, + 0.42239, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.10542, - 0.0962, - 0.07332, - 0.09015, - 0.09382, - 0.07597, - 0.08496, - 0.07806, - 0.06455, - ]) + expected_slice = np.array( + [ + 0.10542, + 0.0962, + 0.07332, + 0.09015, + 0.09382, + 0.07597, + 0.08496, + 0.07806, + 0.06455, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", 
safety_checker=None) + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.03503, - 0.03494, - 0.01087, - 0.03128, - 0.02552, - 0.00803, - 0.00742, - 0.00372, - 0.0, - ]) + expected_slice = np.array( + [ + 0.03503, + 0.03494, + 0.01087, + 0.03128, + 0.02552, + 0.00803, + 0.00742, + 0.00372, + 0.0, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 # def test_stable_diffusion_attention_slicing(self): @@ -670,8 +684,7 @@ def test_stable_diffusion_dpm(self): # assert np.abs(image_sliced - image).max() < 0.01 def test_stable_diffusion_fp16_vs_autocast(self): - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(dtype="float16") image_fp16 = pipe(**inputs).images @@ -684,8 +697,7 @@ def test_stable_diffusion_fp16_vs_autocast(self): def test_stable_diffusion_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -693,40 +705,41 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.5693, - -0.3018, - -0.9746, - 0.0518, - -0.877, - 0.7559, - -1.7402, - 0.1022, - 1.1582, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.5693, + -0.3018, + -0.9746, + 0.0518, + -0.877, + 0.7559, + -1.7402, + 0.1022, + 1.1582, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.1958, - -0.2993, - -1.0166, - -0.5005, - -0.481, - 0.6162, - -0.9492, - 0.6621, - 1.4492, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.1958, + -0.2993, + -1.0166, + -0.5005, + -0.481, + 0.6162, + -0.9492, + 0.6621, + 1.4492, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -758,8 +771,7 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -770,22 +782,22 @@ def test_stable_diffusion_1_4_pndm(self): 
assert max_diff < 0.001 def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([ - [0.7839468, 0.6564859, 0.48896512], - [0.78088367, 0.6400461, 0.447728], - [0.81458974, 0.67865074, 0.51496047], - ]) + expected_image = np.array( + [ + [0.7839468, 0.6564859, 0.48896512], + [0.78088367, 0.6400461, 0.447728], + [0.81458974, 0.67865074, 0.51496047], + ] + ) max_diff = np.abs(expected_image - image[0][0:3]).max() assert max_diff < 0.001 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -797,10 +809,8 @@ def test_stable_diffusion_ddim(self): assert max_diff < 0.001 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -811,34 +821,34 @@ def test_stable_diffusion_lms(self): assert max_diff < 0.001 def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = EulerDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([ - [0.7907467, 0.69895816, 0.5911293], - [0.7878128, 0.6815276, 0.55695873], - [0.79491043, 0.69076216, 0.58900857], - ]) + expected_image = np.array( + [ + [0.7907467, 0.69895816, 0.5911293], + [0.7878128, 0.6815276, 0.55695873], + [0.79491043, 0.69076216, 0.58900857], + ] + ) max_diff = np.abs(expected_image - image[0][0:3]).max() assert max_diff < 0.001 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 25 image = sd_pipe(**inputs).images[0] - expected_image = np.array([ - [0.8398815, 0.7510048, 0.6475117], - [0.8548264, 0.75703114, 0.63529825], - [0.8559129, 0.75676, 0.6597851], - ]) + expected_image = np.array( + [ + [0.8398815, 0.7510048, 0.6475117], + [0.8548264, 0.75703114, 0.63529825], + [0.8559129, 0.75676, 0.6597851], + ] + ) max_diff = np.abs(expected_image - image[0][0:3]).max() assert max_diff < 0.001 diff --git 
a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py index 5c19060a6d83a..4a6a51ef4cefb 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py @@ -20,19 +20,24 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, PNDMScheduler, - StableDiffusionAdapterPipeline, T2IAdapter, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + PNDMScheduler, + StableDiffusionAdapterPipeline, + T2IAdapter, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, load_numpy, slow from ppdiffusers.utils.import_utils import is_ppxformers_available -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionAdapterPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS @@ -47,7 +52,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.Generator().manual_seed(seed=0) vae = AutoencoderKL( @@ -56,7 +62,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) vae_scale_factor = 2 paddle.Generator().manual_seed(seed=0) text_encoder_config = CLIPTextConfig( @@ -68,10 +75,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.Generator().manual_seed(seed=0) adapter = T2IAdapter( block_out_channels=[32, 64], @@ -80,7 +87,8 @@ def get_dummy_components(self): kernel_size=1, res_block_skip=True, use_conv=False, - input_scale_factor=vae_scale_factor, ) + input_scale_factor=vae_scale_factor, + ) components = { "adapter": adapter, "unet": unet, @@ -114,30 +122,30 @@ def test_stable_diffusion_adapter_default_case(self): image = sd_pipe(**inputs).images image_slice = image[(0), -3:, -3:, (-1)] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.9088084, - 0.6012194, - 0.43046606, - 0.7228667, - 0.46428588, - 0.30164504, - 0.508494, - 0.6241546, - 0.55453974, - ]) + expected_slice = np.array( + [ + 0.9088084, + 0.6012194, + 0.43046606, + 0.7228667, + 0.46428588, + 0.30164504, + 0.508494, + 0.6241546, + 0.55453974, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass( - 
expected_max_diff=0.002) + return self._test_attention_slicing_forward_pass(expected_max_diff=0.002) @unittest.skipIf( not is_ppxformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=0.002) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=0.002) def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=0.002) @@ -153,16 +161,12 @@ def tearDown(self): def get_inputs(self, revision="segmentation", dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) image_urls = { - "segmentation": - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png", - "keypose": - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png", - "depth": - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png", + "segmentation": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png", + "keypose": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png", + "depth": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png", } prompt_by_rev = { - "segmentation": - "A black Honda motorcycle parked in front of a garage", + "segmentation": "A black Honda motorcycle parked in front of a garage", "keypose": "An astronaut on the moon", "depth": "An office room with nice view", } @@ -180,9 +184,8 @@ def get_inputs(self, revision="segmentation", dtype="float32", seed=0): def test_stable_diffusion_segmentation_adapter(self): adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-seg") pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - safety_checker=None) + "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(revision="segmentation") @@ -196,9 +199,8 @@ def test_stable_diffusion_segmentation_adapter(self): def test_stable_diffusion_keypose_adapter(self): adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-keypose") pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - safety_checker=None) + "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(revision="keypose") @@ -212,9 +214,8 @@ def test_stable_diffusion_keypose_adapter(self): def test_stable_diffusion_depth_adapter(self): adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-depth") pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - safety_checker=None) + "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(revision="depth") diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index ddebfd6234a13..8b85c7bd484db 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -19,9 +19,13 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler, - StableDiffusionControlNetPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + StableDiffusionControlNetPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import load_image, load_numpy, randn_tensor, slow from ppdiffusers.utils.import_utils import is_ppxformers_available from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -30,8 +34,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS @@ -46,7 +49,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) paddle.seed(0) controlnet = ControlNetModel( block_out_channels=(32, 64), @@ -54,14 +58,16 @@ def get_dummy_components(self): in_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), ) + conditioning_embedding_out_channels=(16, 32), + ) paddle.seed(0) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -69,7 +75,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -80,10 +87,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, @@ -106,8 +113,10 @@ def get_dummy_inputs(self, seed=0): 1, 3, 32 * controlnet_embedder_scale_factor, - 32 * controlnet_embedder_scale_factor, ), - generator=generator, ) + 32 * controlnet_embedder_scale_factor, + ), + generator=generator, + ) inputs = { "prompt": "A painting of a squirrel eating a burger", @@ -128,8 +137,7 @@ def test_attention_slicing_forward_pass(self): reason="XFormers attention is only available with CUDA and `xformers` installed", ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-2) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-2) def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) @@ -144,13 +152,11 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetModel.from_pretrained( - 
"lllyasviel/sd-controlnet-canny") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -172,13 +178,11 @@ def test_canny(self): assert np.abs(expected_image - image).max() < 5e-3 def test_depth(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-depth") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -200,13 +204,11 @@ def test_depth(self): assert np.abs(expected_image - image).max() < 5e-3 def test_hed(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-hed") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -228,13 +230,11 @@ def test_hed(self): assert np.abs(expected_image - image).max() < 5e-3 def test_mlsd(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-mlsd") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -256,13 +256,11 @@ def test_mlsd(self): assert np.abs(expected_image - image).max() < 5e-3 def test_normal(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-normal") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -284,13 +282,11 @@ def test_normal(self): assert np.abs(expected_image - image).max() < 5e-3 def test_openpose(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -312,13 +308,11 @@ def test_openpose(self): assert np.abs(expected_image - image).max() < 5e-3 def test_scribble(self): - controlnet = 
ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-scribble") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(5) @@ -340,13 +334,11 @@ def test_scribble(self): assert np.abs(expected_image - image).max() < 5e-3 def test_seg(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-seg") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(5) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 8739a78286b5f..a73cfcdbf1291 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -19,24 +19,28 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPVisionConfig, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from PIL import Image -from ppdiffusers import (AutoencoderKL, DPMSolverMultistepScheduler, - PNDMScheduler, StableDiffusionImageVariationPipeline, - UNet2DConditionModel) -from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly, - slow) +from ppdiffusers import ( + AutoencoderKL, + DPMSolverMultistepScheduler, + PNDMScheduler, + StableDiffusionImageVariationPipeline, + UNet2DConditionModel, +) +from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (IMAGE_VARIATION_BATCH_PARAMS, - IMAGE_VARIATION_PARAMS) +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionImageVariationPipeline params = IMAGE_VARIATION_PARAMS batch_params = IMAGE_VARIATION_BATCH_PARAMS @@ -51,7 +55,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -60,7 +65,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) image_encoder_config = CLIPVisionConfig( hidden_size=32, @@ -70,7 +76,8 @@ def get_dummy_components(self): 
num_attention_heads=4, num_hidden_layers=5, image_size=32, - patch_size=4, ) + patch_size=4, + ) image_encoder = CLIPVisionModelWithProjection(image_encoder_config) feature_extractor = CLIPImageProcessor(crop_size=32, size=32) components = { @@ -106,17 +113,19 @@ def test_stable_diffusion_img_variation_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.22073305, - 0.22751817, - 0.32176197, - 0.26315716, - 0.25681925, - 0.41432184, - 0.2454437, - 0.10104704, - 0.32165903, - ]) + expected_slice = np.array( + [ + 0.22073305, + 0.22751817, + 0.32176197, + 0.26315716, + 0.25681925, + 0.41432184, + 0.2454437, + 0.10104704, + 0.32165903, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -130,17 +139,19 @@ def test_stable_diffusion_img_variation_multiple_images(self): image = output.images image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([ - 0.61040395, - 0.7414253, - 0.5950623, - 0.5843509, - 0.25609648, - 0.28481025, - 0.61782926, - 0.3014974, - 0.35131538, - ]) + expected_slice = np.array( + [ + 0.61040395, + 0.7414253, + 0.5950623, + 0.5843509, + 0.25609648, + 0.28481025, + 0.61782926, + 0.3014974, + 0.35131538, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -154,9 +165,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png") latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) latents = paddle.to_tensor(latents).cast(dtype) inputs = { @@ -171,30 +180,32 @@ def get_inputs(self, dtype="float32", seed=0): def test_stable_diffusion_img_variation_pipeline_default(self): sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", safety_checker=None) + "fusing/sd-image-variations-diffusers", safety_checker=None + ) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.5717014670372009, - 0.47024625539779663, - 0.47462183237075806, - 0.6388776898384094, - 0.5250844359397888, - 0.500831663608551, - 0.638043999671936, - 0.5769134163856506, - 0.5223015546798706, - ]) + expected_slice = np.array( + [ + 0.5717014670372009, + 0.47024625539779663, + 0.47462183237075806, + 0.6388776898384094, + 0.5250844359397888, + 0.500831663608551, + 0.638043999671936, + 0.5769134163856506, + 0.5223015546798706, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_img_variation_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -202,42 +213,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.1621, - 0.2837, - -0.7979, - -0.1221, - -1.3057, - 0.7681, - -2.1191, - 0.0464, - 1.6309, - ]) - 
assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.1621, + 0.2837, + -0.7979, + -0.1221, + -1.3057, + 0.7681, + -2.1191, + 0.0464, + 1.6309, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.6299, - 1.75, - 1.1992, - -2.1582, - -1.8994, - 0.7334, - -0.709, - 1.0137, - 1.5273, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.6299, + 1.75, + 1.1992, + -2.1582, + -1.8994, + 0.7334, + -0.709, + 1.0137, + 1.5273, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionImageVariationPipeline.from_pretrained( "fusing/sd-image-variations-diffusers", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -256,9 +270,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png") latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) latents = paddle.to_tensor(latents).cast(dtype) inputs = { @@ -272,28 +284,21 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_img_variation_pndm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers") + sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img_variation_dpm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 25 image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 86f394a233323..101468b9a4534 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -21,27 +21,30 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionImg2ImgPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImg2ImgPipeline, + UNet2DConditionModel, +) from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly, - slow) +from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS def get_dummy_components(self): @@ -54,7 +57,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -63,7 +67,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,10 +79,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -101,8 +106,7 @@ def get_dummy_inputs(self, seed=0, input_image_type="pd", output_type="np"): input_image = image.numpy().transpose(0, 2, 3, 1) input_image = VaeImageProcessor.numpy_to_pil(input_image) else: - raise ValueError( - f"unsupported input_image_type {input_image_type}.") + raise ValueError(f"unsupported input_image_type {input_image_type}.") if output_type not in ["pd", "np", "pil"]: raise ValueError(f"unsupported output_type {output_type}") @@ -125,17 +129,19 @@ def test_stable_diffusion_img2img_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.50082374, - 0.49329656, - 0.4963757, - 0.46307105, - 0.44599247, - 0.4877512, - 0.560709, - 0.56884044, - 
0.5738671, - ]) + expected_slice = np.array( + [ + 0.50082374, + 0.49329656, + 0.4963757, + 0.46307105, + 0.44599247, + 0.4877512, + 0.560709, + 0.56884044, + 0.5738671, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_img2img_negative_prompt(self): @@ -149,17 +155,19 @@ def test_stable_diffusion_img2img_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.48659712, - 0.4004616, - 0.4762491, - 0.49117112, - 0.5414775, - 0.58218545, - 0.5550886, - 0.52305603, - 0.61624044, - ]) + expected_slice = np.array( + [ + 0.48659712, + 0.4004616, + 0.4762491, + 0.49117112, + 0.5414775, + 0.58218545, + 0.5550886, + 0.52305603, + 0.61624044, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_img2img_multiple_init_images(self): @@ -173,40 +181,45 @@ def test_stable_diffusion_img2img_multiple_init_images(self): image = sd_pipe(**inputs).images image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([ - 0.49016288, - 0.23989454, - 0.4229045, - 0.56873804, - 0.467226, - 0.5793949, - 0.6967555, - 0.7027658, - 0.5809763, - ]) + expected_slice = np.array( + [ + 0.49016288, + 0.23989454, + 0.4229045, + 0.56873804, + 0.467226, + 0.5793949, + 0.6967555, + 0.7027658, + 0.5809763, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_img2img_k_lms(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionImg2ImgPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.29999942, - 0.5206376, - 0.37915814, - 0.4033721, - 0.7630579, - 0.4642547, - 0.5823178, - 0.6936951, - 0.48969278, - ]) + expected_slice = np.array( + [ + 0.29999942, + 0.5206376, + 0.37915814, + 0.4033721, + 0.7630579, + 0.4642547, + 0.5823178, + 0.6936951, + 0.48969278, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_pt_np_pil_outputs_equivalent(self): @@ -218,10 +231,8 @@ def test_pt_np_pil_outputs_equivalent(self): output_np = sd_pipe(**self.get_dummy_inputs(output_type="np"))[0] output_pil = sd_pipe(**self.get_dummy_inputs(output_type="pil"))[0] - assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max( - ) <= 1e-4 - assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max( - ) <= 1e-4 + assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 + assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 def test_image_types_consistent(self): components = self.get_dummy_components() @@ -245,9 +256,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png") inputs = { "prompt": "a fantasy landscape, concept art, high resolution", "image": init_image, @@ -286,25 +295,26 @@ def get_inputs(self, 
dtype="float32", seed=0): # assert mean_diff < 5e-2 def test_stable_diffusion_img2img_default(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([ - 0.27150, - 0.14849, - 0.15605, - 0.26740, - 0.16954, - 0.18204, - 0.31470, - 0.26311, - 0.24525, - ]) + expected_slice = np.array( + [ + 0.27150, + 0.14849, + 0.15605, + 0.26740, + 0.16954, + 0.18204, + 0.31470, + 0.26311, + 0.24525, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 # def test_img2img_safety_checker_works(self): @@ -322,8 +332,7 @@ def test_stable_diffusion_img2img_default(self): # assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros def test_stable_diffusion_img2img_k_lms(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -331,22 +340,23 @@ def test_stable_diffusion_img2img_k_lms(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([ - 0.04890, - 0.04862, - 0.06422, - 0.04655, - 0.05108, - 0.05307, - 0.05926, - 0.08759, - 0.06852, - ]) + expected_slice = np.array( + [ + 0.04890, + 0.04862, + 0.06422, + 0.04655, + 0.05108, + 0.05307, + 0.05926, + 0.08759, + 0.06852, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_img2img_ddim(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -354,24 +364,25 @@ def test_stable_diffusion_img2img_ddim(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([ - 0.06069, - 0.05703, - 0.08054, - 0.05797, - 0.06286, - 0.06234, - 0.08438, - 0.11151, - 0.08068, - ]) + expected_slice = np.array( + [ + 0.06069, + 0.05703, + 0.08054, + 0.05797, + 0.06286, + 0.06234, + 0.08438, + 0.11151, + 0.08068, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_img2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -379,42 +390,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7650054097175598, - 0.10256098955869675, - 0.4976114332675934, - 3.388350009918213, - 
3.7242040634155273, - 4.272988796234131, - 2.4656283855438232, - 3.483647108078003, - 1.765011191368103, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7650054097175598, + 0.10256098955869675, + 0.4976114332675934, + 3.388350009918213, + 3.7242040634155273, + 4.272988796234131, + 2.4656283855438232, + 3.483647108078003, + 1.765011191368103, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7580092549324036, - 0.10288780182600021, - 0.4941849708557129, - 3.3663346767425537, - 3.7071609497070312, - 4.25173807144165, - 2.4461638927459717, - 3.451681137084961, - 1.761878490447998, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7580092549324036, + 0.10288780182600021, + 0.4941849708557129, + 3.3663346767425537, + 3.7071609497070312, + 4.25173807144165, + 2.4461638927459717, + 3.451681137084961, + 1.761878490447998, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionImg2ImgPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -423,13 +437,10 @@ def callback_fn(step: int, timestep: int, assert number_of_steps == 2 def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg") init_image = init_image.resize((760, 504)) model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "A fantasy landscape, trending on artstation" @@ -440,21 +451,24 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): strength=0.75, guidance_scale=7.5, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] image_slice = image[255:258, 383:386, -1] assert image.shape == (504, 760, 3) - expected_slice = np.array([ - 0.71240354, - 0.71053374, - 0.69922864, - 0.7139934, - 0.7106118, - 0.69451976, - 0.71982634, - 0.71717453, - 0.70306426, - ]) + expected_slice = np.array( + [ + 0.71240354, + 0.71053374, + 0.69922864, + 0.7139934, + 0.7106118, + 0.69451976, + 0.71982634, + 0.71717453, + 0.70306426, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 @@ -468,9 +482,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png") inputs = { "prompt": "a fantasy landscape, concept art, high resolution", "image": init_image, @@ -483,59 +495,45 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def 
test_img2img_pndm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img2img_ddim(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img2img_lms(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img2img_dpm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 30 image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 70688fa0182a1..0a815f465532b 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -22,22 +22,28 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DPMSolverMultistepScheduler, - LMSDiscreteScheduler, 
PNDMScheduler, - StableDiffusionInpaintPipeline, UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import \ - prepare_mask_and_masked_image -from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly, - slow) +from ppdiffusers import ( + AutoencoderKL, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInpaintPipeline, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import ( + prepare_mask_and_masked_image, +) +from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS @@ -52,7 +58,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -61,7 +68,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -72,10 +80,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -90,11 +98,8 @@ def get_dummy_components(self): def get_dummy_inputs(self, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (64, 64))) + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) generator = paddle.Generator().manual_seed(seed) inputs = { @@ -116,17 +121,19 @@ def test_stable_diffusion_inpaint(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.55786943, - 0.628228, - 0.49147403, - 0.3191774, - 0.39249492, - 0.46521175, - 0.29909956, - 0.21160087, - 0.42932406, - ]) + expected_slice = np.array( + [ + 0.55786943, + 0.628228, + 0.49147403, + 0.3191774, + 0.39249492, + 0.46521175, + 0.29909956, + 0.21160087, + 0.42932406, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() 
< 0.01 def test_stable_diffusion_inpaint_image_tensor(self): @@ -138,11 +145,11 @@ def test_stable_diffusion_inpaint_image_tensor(self): out_pil = output.images inputs = self.get_dummy_inputs() inputs["image"] = ( - paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1) - .transpose(perm=[2, 0, 1]).unsqueeze(axis=0)) + paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1).transpose(perm=[2, 0, 1]).unsqueeze(axis=0) + ) inputs["mask_image"] = ( - paddle.to_tensor(np.array(inputs["mask_image"]) / 255) - .transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0)) + paddle.to_tensor(np.array(inputs["mask_image"]) / 255).transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0) + ) output = sd_pipe(**inputs) out_tensor = output.images assert out_pil.shape == (1, 64, 64, 3) @@ -166,13 +173,10 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") + mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") inputs = { - "prompt": - "Face of a yellow cat, high resolution, sitting on a park bench", + "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", "image": init_image, "mask_image": mask_image, "generator": generator, @@ -184,53 +188,60 @@ def get_inputs(self, dtype="float32", seed=0): def test_stable_diffusion_inpaint_ddim(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None) + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.05978, - 0.10983, - 0.10514, - 0.07922, - 0.08483, - 0.08587, - 0.05302, - 0.03218, - 0.01636, - ]) + expected_slice = np.array( + [ + 0.05978, + 0.10983, + 0.10514, + 0.07922, + 0.08483, + 0.08587, + 0.05302, + 0.03218, + 0.01636, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_fp16(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( "runwayml/stable-diffusion-inpainting", paddle_dtype=paddle.float16, - safety_checker=None, ) + safety_checker=None, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.9921875, - 0.9477539, - 0.90234375, - 0.96484375, - 0.9189453, - 0.875, - 0.9316406, - 0.9013672, - 0.875, - ]) + expected_slice = np.array( + [ + 0.9921875, + 0.9477539, + 0.90234375, + 0.96484375, + 0.9189453, + 0.875, + 0.9316406, + 0.9013672, + 0.875, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.05 def test_stable_diffusion_inpaint_pndm(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None) + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) 
pipe.enable_attention_slicing() @@ -238,22 +249,25 @@ def test_stable_diffusion_inpaint_pndm(self): image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.06892, - 0.06994, - 0.07905, - 0.05366, - 0.04709, - 0.04890, - 0.04107, - 0.05083, - 0.04180, - ]) + expected_slice = np.array( + [ + 0.06892, + 0.06994, + 0.07905, + 0.05366, + 0.04709, + 0.04890, + 0.04107, + 0.05083, + 0.04180, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_k_lms(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None) + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -261,17 +275,19 @@ def test_stable_diffusion_inpaint_k_lms(self): image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.23513, - 0.22413, - 0.29442, - 0.24243, - 0.26214, - 0.30329, - 0.26431, - 0.25025, - 0.25197, - ]) + expected_slice = np.array( + [ + 0.23513, + 0.22413, + 0.29442, + 0.24243, + 0.26214, + 0.30329, + 0.26431, + 0.25025, + 0.25197, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 @@ -285,13 +301,10 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") + mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") inputs = { - "prompt": - "Face of a yellow cat, high resolution, sitting on a park bench", + "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", "image": init_image, "mask_image": mask_image, "generator": generator, @@ -302,52 +315,40 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy" - ) + expected_image = 
load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -360,8 +361,7 @@ def test_inpaint_dpm(self): assert max_diff < 0.001 -class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests( - unittest.TestCase): +class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase): def test_pil_inputs(self): im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8) im = Image.fromarray(im) @@ -389,8 +389,7 @@ def test_np_inputs(self): mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5 mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8)) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, - mask_pil) + t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil) self.assertTrue((t_mask_np == t_mask_pil).all()) self.assertTrue((t_masked_np == t_masked_pil).all()) @@ -401,7 +400,8 @@ def test_paddle_3D_2D_inputs(self): mask_np = mask_tensor.numpy() t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) @@ -413,7 +413,8 @@ def test_paddle_3D_3D_inputs(self): im_np = im_tensor.numpy().transpose(1, 2, 0) mask_np = mask_tensor.numpy()[0] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) @@ -424,7 +425,8 @@ def test_paddle_4D_2D_inputs(self): im_np = im_tensor.numpy()[0].transpose(1, 2, 0) mask_np = mask_tensor.numpy() t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, 
mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) @@ -435,19 +437,20 @@ def test_paddle_4D_3D_inputs(self): im_np = im_tensor.numpy()[0].transpose(1, 2, 0) mask_np = mask_tensor.numpy()[0] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) def test_paddle_4D_4D_inputs(self): im_tensor = paddle.randint(0, 255, (1, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, - (1, 1, 32, 32)).cast("uint8") > 127.5 + mask_tensor = paddle.randint(0, 255, (1, 1, 32, 32)).cast("uint8") > 127.5 im_np = im_tensor.numpy()[0].transpose(1, 2, 0) mask_np = mask_tensor.numpy()[0][0] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np.cast("float64")).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) @@ -458,11 +461,9 @@ def test_paddle_batch_4D_3D(self): im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] mask_nps = [mask.numpy() for mask in mask_tensor] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) - nps = [ - prepare_mask_and_masked_image(i, m) - for i, m in zip(im_nps, mask_nps) - ] + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) + nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] t_mask_np = paddle.concat(x=[n[0] for n in nps]) t_masked_np = paddle.concat(x=[n[1] for n in nps]) self.assertTrue((t_mask_tensor == t_mask_np).all()) @@ -475,11 +476,9 @@ def test_paddle_batch_4D_4D(self): im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] mask_nps = [mask.numpy() for mask in mask_tensor] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) - nps = [ - prepare_mask_and_masked_image(i, m) - for i, m in zip(im_nps, mask_nps) - ] + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) + nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] t_mask_np = paddle.concat(x=[n[0] for n in nps]) t_masked_np = paddle.concat(x=[n[1] for n in nps]) self.assertTrue((t_mask_tensor == t_mask_np).all()) @@ -487,44 +486,28 @@ def test_paddle_batch_4D_4D(self): def test_shape_mismatch(self): with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64])) + prepare_mask_and_masked_image(paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64])) with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.randn(shape=[2, 3, 32, 32]), - paddle.randn(shape=[4, 64, 64])) + prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 64, 64])) with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.randn(shape=[2, 3, 32, 32]), - paddle.randn(shape=[4, 1, 64, 64])) + prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 1, 64, 64])) def test_type_mismatch(self): with self.assertRaises(TypeError): - prepare_mask_and_masked_image( - 
paddle.rand(shape=[3, 32, 32]), - paddle.rand(shape=[3, 32, 32]).numpy()) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.rand(shape=[3, 32, 32]).numpy()) with self.assertRaises(TypeError): - prepare_mask_and_masked_image( - paddle.rand(shape=[3, 32, 32]).numpy(), - paddle.rand(shape=[3, 32, 32])) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]).numpy(), paddle.rand(shape=[3, 32, 32])) def test_channels_first(self): with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32])) + prepare_mask_and_masked_image(paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32])) def test_tensor_range(self): with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32])) + prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32])) with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.ones(shape=[3, 32, 32]) * -2, - paddle.rand(shape=[32, 32])) + prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * -2, paddle.rand(shape=[32, 32])) with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2) with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.rand(shape=[3, 32, 32]), - paddle.ones(shape=[32, 32]) * -1) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * -1) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index 6866f1a367654..aef0082255467 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -22,13 +22,23 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionInpaintPipelineLegacy, - UNet2DConditionModel, UNet2DModel, VQModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInpaintPipelineLegacy, + UNet2DConditionModel, + UNet2DModel, + VQModel, +) from ppdiffusers.utils import floats_tensor, load_image, nightly, slow -from ppdiffusers.utils.testing_utils import (load_numpy, preprocess_image, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import ( + load_numpy, + preprocess_image, + require_paddle_gpu, +) class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): @@ -42,8 +52,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -56,7 +65,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model @property @@ -70,7 +80,8 @@ def 
dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -84,7 +95,8 @@ def dummy_cond_unet_inpaint(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -96,7 +108,8 @@ def dummy_vq_model(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, ) + latent_channels=3, + ) return model @property @@ -108,7 +121,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -123,7 +137,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -146,13 +161,10 @@ def test_stable_diffusion_inpaint_legacy(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) sd_pipe = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -160,7 +172,8 @@ def test_stable_diffusion_inpaint_legacy(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -171,7 +184,8 @@ def test_stable_diffusion_inpaint_legacy(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ) + mask_image=mask_image, + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -182,32 +196,33 @@ def test_stable_diffusion_inpaint_legacy(self): output_type="np", image=init_image, mask_image=mask_image, - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.01514593, - 0.46352747, - 0.34991893, - 0.29177475, - 0.5415823, - 0.56992227, - 0.39533705, - 0.67953515, - 0.5445507, - ]) + expected_slice = np.array( + [ + 0.01514593, + 0.46352747, + 0.34991893, + 0.29177475, + 0.5415823, + 0.56992227, + 0.39533705, + 0.67953515, + 0.5445507, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_inpaint_legacy_batched(self): unet = self.dummy_cond_unet scheduler = PNDMScheduler(skip_prk_steps=True) 
vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.permute(0, 2, 3, 1)[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") @@ -222,7 +237,8 @@ def test_stable_diffusion_inpaint_legacy_batched(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" @@ -234,35 +250,40 @@ def test_stable_diffusion_inpaint_legacy_batched(self): num_inference_steps=2, output_type="np", image=init_images_tens, - mask_image=init_masks_tens, ).images + mask_image=init_masks_tens, + ).images assert images.shape == (2, 32, 32, 3) image_slice_0 = images[0, -3:, -3:, -1].flatten() image_slice_1 = images[1, -3:, -3:, -1].flatten() - expected_slice_0 = np.array([ - 0.50299895, - 0.6465979, - 0.3489662, - 0.28862774, - 0.59657216, - 0.41669005, - 0.19621253, - 0.27549136, - 0.39040852, - ]) - expected_slice_1 = np.array([ - 0.70079666, - 0.5616544, - 0.5304112, - 0.38820785, - 0.3118701, - 0.47477302, - 0.37215403, - 0.3785481, - 0.50153226, - ]) + expected_slice_0 = np.array( + [ + 0.50299895, + 0.6465979, + 0.3489662, + 0.28862774, + 0.59657216, + 0.41669005, + 0.19621253, + 0.27549136, + 0.39040852, + ] + ) + expected_slice_1 = np.array( + [ + 0.70079666, + 0.5616544, + 0.5304112, + 0.38820785, + 0.3118701, + 0.47477302, + 0.37215403, + 0.3785481, + 0.50153226, + ] + ) assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2 assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-2 @@ -272,13 +293,10 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) sd_pipe = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -286,7 +304,8 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" negative_prompt = "french fries" @@ -299,21 +318,24 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ) + mask_image=mask_image, + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.0, - 0.43941003, - 0.32130337, - 0.31442684, - 0.566114, - 0.56392324, - 0.3946159, - 0.6844422, - 0.5345681, - ]) + expected_slice = np.array( + [ + 0.0, + 0.43941003, + 0.32130337, + 0.31442684, + 0.566114, + 0.56392324, + 0.3946159, + 0.6844422, + 0.5345681, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() 
< 0.01 def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): @@ -321,13 +343,10 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) sd_pipe = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -335,7 +354,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" images = sd_pipe( @@ -343,7 +363,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ).images + mask_image=mask_image, + ).images assert images.shape == (1, 32, 32, 3) batch_size = 2 images = sd_pipe( @@ -351,7 +372,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ).images + mask_image=mask_image, + ).images assert images.shape == (batch_size, 32, 32, 3) num_images_per_prompt = 2 images = sd_pipe( @@ -360,7 +382,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): output_type="np", image=init_image, mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (num_images_per_prompt, 32, 32, 3) batch_size = 2 images = sd_pipe( @@ -369,7 +392,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): output_type="np", image=init_image, mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) @@ -383,10 +407,8 @@ def tearDown(self): def get_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") + mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") inputs = { "prompt": "A red cat sitting on a park bench", "image": init_image, @@ -401,29 +423,33 @@ def get_inputs(self, seed=0): def test_stable_diffusion_inpaint_legacy_pndm(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) 
- expected_slice = np.array([ - 0.27226633, - 0.29068208, - 0.3450312, - 0.21444553, - 0.26328486, - 0.34392387, - 0.18026042, - 0.24961185, - 0.3214044, - ]) + expected_slice = np.array( + [ + 0.27226633, + 0.29068208, + 0.3450312, + 0.21444553, + 0.26328486, + 0.34392387, + 0.18026042, + 0.24961185, + 0.3214044, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_legacy_batched(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -443,35 +469,40 @@ def test_stable_diffusion_inpaint_legacy_batched(self): image_slice_0 = image[0, 253:256, 253:256, -1].flatten() image_slice_1 = image[1, 253:256, 253:256, -1].flatten() - expected_slice_0 = np.array([ - 0.27526367, - 0.29158682, - 0.35184938, - 0.21504477, - 0.26708275, - 0.35169, - 0.18185198, - 0.2572803, - 0.32425082, - ]) - expected_slice_1 = np.array([ - 0.0, - 0.18929192, - 0.7068148, - 0.07977328, - 0.13444492, - 0.5016247, - 0.49761847, - 0.2830933, - 0.36412603, - ]) + expected_slice_0 = np.array( + [ + 0.27526367, + 0.29158682, + 0.35184938, + 0.21504477, + 0.26708275, + 0.35169, + 0.18185198, + 0.2572803, + 0.32425082, + ] + ) + expected_slice_1 = np.array( + [ + 0.0, + 0.18929192, + 0.7068148, + 0.07977328, + 0.13444492, + 0.5016247, + 0.49761847, + 0.2830933, + 0.36412603, + ] + ) assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4 assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4 def test_stable_diffusion_inpaint_legacy_k_lms(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -479,24 +510,25 @@ def test_stable_diffusion_inpaint_legacy_k_lms(self): image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.29036117, - 0.28907132, - 0.32839334, - 0.26510137, - 0.2820784, - 0.31148806, - 0.29358387, - 0.29515788, - 0.28257304, - ]) + expected_slice = np.array( + [ + 0.29036117, + 0.28907132, + 0.32839334, + 0.26510137, + 0.2820784, + 0.31148806, + 0.29358387, + 0.29515788, + 0.28257304, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_legacy_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -504,42 +536,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.103, - 1.415, - -0.02197, - -0.5103, - -0.5903, - 0.1953, - 0.75, - 0.3477, - -1.356, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.001 + expected_slice = np.array( + [ + -0.103, + 1.415, + -0.02197, + -0.5103, + -0.5903, + 0.1953, + 0.75, + 0.3477, + -1.356, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 elif step == 2: latents = 
latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.4802, - 1.154, - 0.628, - 0.2322, - 0.2593, - -0.1455, - 0.7075, - -0.1617, - -0.5615, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.001 + expected_slice = np.array( + [ + 0.4802, + 1.154, + 0.628, + 0.2322, + 0.2593, + -0.1455, + 0.7075, + -0.1617, + -0.5615, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 callback_fn.has_been_called = False pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() @@ -577,20 +612,17 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.7330009, 0.80003107, 0.8268216], - [0.73606366, 0.801595, 0.8470554]]) + expected_image = np.array([[0.7330009, 0.80003107, 0.8268216], [0.73606366, 0.801595, 0.8470554]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) @@ -599,36 +631,29 @@ def test_inpaint_ddim(self): expected_image = load_numpy( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy" ) - expected_image = np.array([[0.7290994, 0.794852, 0.82096446], - [0.7330909, 0.79727536, 0.8420528]]) + expected_image = np.array([[0.7290994, 0.794852, 0.82096446], [0.7330909, 0.79727536, 0.8420528]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.74595624, 0.81757987, 0.84589916], - [0.74728143, 0.81736475, 0.86543]]) + expected_image = np.array([[0.74595624, 0.81757987, 0.84589916], [0.74728143, 0.81736475, 0.86543]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = 
DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 30 image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.7310472, 0.7970823, 0.8231524], - [0.7348697, 0.799358, 0.8439586]]) + expected_image = np.array([[0.7310472, 0.7970823, 0.8231524], [0.7348697, 0.799358, 0.8439586]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index c367a6f472e51..0a6d49df4418f 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -22,20 +22,26 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionInstructPix2PixPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInstructPix2PixPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInstructPix2PixPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { "height", @@ -54,7 +60,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -63,7 +70,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,10 +82,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -114,17 +122,19 @@ def test_stable_diffusion_pix2pix_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.24897021, - 0.3813318, - 0.15630311, - 0.69198483, - 0.7409521, - 0.55128014, - 0.5978868, - 0.60921687, - 
0.47007012, - ]) + expected_slice = np.array( + [ + 0.24897021, + 0.3813318, + 0.15630311, + 0.69198483, + 0.7409521, + 0.55128014, + 0.5978868, + 0.60921687, + 0.47007012, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_pix2pix_negative_prompt(self): @@ -137,17 +147,19 @@ def test_stable_diffusion_pix2pix_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.27121854, - 0.34936333, - 0.12865198, - 0.77894104, - 0.81688535, - 0.6136005, - 0.62261313, - 0.6386795, - 0.5096967, - ]) + expected_slice = np.array( + [ + 0.27121854, + 0.34936333, + 0.12865198, + 0.77894104, + 0.81688535, + 0.6136005, + 0.62261313, + 0.6386795, + 0.5096967, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_pix2pix_multiple_init_images(self): @@ -164,23 +176,26 @@ def test_stable_diffusion_pix2pix_multiple_init_images(self): image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([ - 0.41508308, - 0.41580454, - 0.5588631, - 0.32340443, - 0.20930073, - 0.35993075, - 0.28470254, - 0.38203996, - 0.51769114, - ]) + expected_slice = np.array( + [ + 0.41508308, + 0.41580454, + 0.5588631, + 0.32340443, + 0.20930073, + 0.35993075, + 0.28470254, + 0.38203996, + 0.51769114, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_pix2pix_euler(self): components = self.get_dummy_components() components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() @@ -189,17 +204,19 @@ def test_stable_diffusion_pix2pix_euler(self): slice = [round(x, 4) for x in image_slice.flatten().tolist()] print(",".join([str(x) for x in slice])) assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.26694882, - 0.4288544, - 0.21950376, - 0.74369204, - 0.6756442, - 0.54577595, - 0.5941435, - 0.5603916, - 0.51743454, - ]) + expected_slice = np.array( + [ + 0.26694882, + 0.4288544, + 0.21950376, + 0.74369204, + 0.6756442, + 0.54577595, + 0.5941435, + 0.5603916, + 0.51743454, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -213,8 +230,7 @@ def tearDown(self): def get_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed=seed) - image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/example.jpg") + image = load_image("https://paddlenlp.bj.bcebos.com/data/images/example.jpg") inputs = { "prompt": "turn him into a cyborg", "image": image, @@ -228,29 +244,33 @@ def get_inputs(self, seed=0): def test_stable_diffusion_pix2pix_default(self): pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None) + "timbrooks/instruct-pix2pix", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.32138163, - 0.32519442, - 0.33127248, - 0.32613453, - 0.33317798, - 0.33505, - 0.32397628, - 0.32964426, - 0.32055843, - ]) + expected_slice = np.array( + [ + 
0.32138163, + 0.32519442, + 0.33127248, + 0.32613453, + 0.33317798, + 0.33505, + 0.32397628, + 0.32964426, + 0.32055843, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_pix2pix_k_lms(self): pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None) + "timbrooks/instruct-pix2pix", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -258,22 +278,25 @@ def test_stable_diffusion_pix2pix_k_lms(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.38934484, - 0.3929934, - 0.39973113, - 0.4196028, - 0.42386433, - 0.43073824, - 0.4267708, - 0.43173674, - 0.41896266, - ]) + expected_slice = np.array( + [ + 0.38934484, + 0.3929934, + 0.39973113, + 0.4196028, + 0.42386433, + 0.43073824, + 0.4267708, + 0.43173674, + 0.41896266, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_pix2pix_ddim(self): pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None) + "timbrooks/instruct-pix2pix", safety_checker=None + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -281,24 +304,25 @@ def test_stable_diffusion_pix2pix_ddim(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.51511174, - 0.5185677, - 0.51326, - 0.5176025, - 0.514665, - 0.519833, - 0.52196854, - 0.5121842, - 0.52435803, - ]) + expected_slice = np.array( + [ + 0.51511174, + 0.5185677, + 0.51326, + 0.5176025, + 0.514665, + 0.519833, + 0.52196854, + 0.5121842, + 0.52435803, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_pix2pix_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -306,28 +330,21 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556, - 1.227 - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array([-0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556, 1.227]) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537, - 1.239 - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array([-0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537, 1.239]) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( "timbrooks/instruct-pix2pix", safety_checker=None, - paddle_dtype=paddle.float16, ) + 
paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() @@ -339,23 +356,24 @@ def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self): inputs = self.get_inputs() inputs["image"] = inputs["image"].resize((504, 504)) model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() output = pipe(**inputs) image = output.images[0] image_slice = image[255:258, 383:386, -1] assert image.shape == (504, 504, 3) - expected_slice = np.array([ - 0.183373, - 0.20458564, - 0.2428664, - 0.18245864, - 0.22010538, - 0.25757712, - 0.19680199, - 0.2185145, - 0.24869373, - ]) + expected_slice = np.array( + [ + 0.183373, + 0.20458564, + 0.2428664, + 0.18245864, + 0.22010538, + 0.25757712, + 0.19680199, + 0.2185145, + 0.24869373, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 176a0629de209..9f4ef2ff6f041 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -20,10 +20,15 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionPanoramaPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPanoramaPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -31,8 +36,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionPanoramaPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS @@ -47,7 +51,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler() paddle.seed(0) vae = AutoencoderKL( @@ -56,7 +61,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -67,10 +73,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -103,17 +109,19 @@ def 
test_stable_diffusion_panorama_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28862977, - 0.2441951, - 0.2683525, - 0.33122095, - 0.28755113, - 0.46375293, - 0.254181, - 0.30616608, - 0.4785265, - ]) + expected_slice = np.array( + [ + 0.28862977, + 0.2441951, + 0.2683525, + 0.33122095, + 0.28755113, + 0.46375293, + 0.254181, + 0.30616608, + 0.4785265, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 # override to speed the overall test timing up. @@ -134,40 +142,45 @@ def test_stable_diffusion_panorama_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28995812, - 0.24463832, - 0.2682391, - 0.33033937, - 0.2868188, - 0.46267676, - 0.25425047, - 0.3066897, - 0.47881347, - ]) + expected_slice = np.array( + [ + 0.28995812, + 0.24463832, + 0.2682391, + 0.33033937, + 0.2868188, + 0.46267676, + 0.25425047, + 0.3066897, + 0.47881347, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_panorama_euler(self): components = self.get_dummy_components() components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionPanoramaPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.32409406, - 0.2660764, - 0.41739762, - 0.18994612, - 0.32522476, - 0.4869789, - 0.13573006, - 0.14128971, - 0.32650158, - ]) + expected_slice = np.array( + [ + 0.32409406, + 0.2660764, + 0.41739762, + 0.18994612, + 0.32522476, + 0.4869789, + 0.13573006, + 0.14128971, + 0.32650158, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_panorama_pndm(self): @@ -201,32 +214,33 @@ def get_inputs(self, seed=0): def test_stable_diffusion_panorama_default(self): model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained( - model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 2048, 3) - expected_slice = np.array([ - 0.34261876, - 0.3045774, - 0.34545267, - 0.33774284, - 0.3431282, - 0.33453488, - 0.3094663, - 0.32646674, - 0.32534528, - ]) + expected_slice = np.array( + [ + 0.34261876, + 0.3045774, + 0.34545267, + 0.33774284, + 0.3431282, + 0.33453488, + 0.3094663, + 0.32646674, + 0.32534528, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.01 def test_stable_diffusion_panorama_k_lms(self): pipe = StableDiffusionPanoramaPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", safety_checker=None) + "stabilityai/stable-diffusion-2-base", safety_checker=None + ) pipe.scheduler = 
LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -234,24 +248,25 @@ def test_stable_diffusion_panorama_k_lms(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 2048, 3) - expected_slice = np.array([ - 0.0, - 0.01188838, - 0.02675471, - 0.00534895, - 0.02325496, - 0.01234779, - 0.0348064, - 0.0, - 0.02607787, - ]) + expected_slice = np.array( + [ + 0.0, + 0.01188838, + 0.02675471, + 0.00534895, + 0.02325496, + 0.01234779, + 0.0348064, + 0.0, + 0.02607787, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.01 def test_stable_diffusion_panorama_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -259,43 +274,43 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 256) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7392851114273071, - -0.16683124005794525, - 0.2063215672969818, - -0.09840865433216095, - 0.18722617626190186, - -0.08375956118106842, - 0.06995373964309692, - -0.20892930030822754, - -0.157355397939682, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7392851114273071, + -0.16683124005794525, + 0.2063215672969818, + -0.09840865433216095, + 0.18722617626190186, + -0.08375956118106842, + 0.06995373964309692, + -0.20892930030822754, + -0.157355397939682, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 256) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7368452548980713, - -0.16317462921142578, - 0.20289096236228943, - -0.10271137207746506, - 0.1873130351305008, - -0.08454630523920059, - 0.06944799423217773, - -0.20782311260700226, - -0.15696658194065094, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7368452548980713, + -0.16317462921142578, + 0.20289096236228943, + -0.10271137207746506, + 0.1873130351305008, + -0.08454630523920059, + 0.06944799423217773, + -0.20782311260700226, + -0.15696658194065094, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained( - model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 0bee318686efc..d4787ab8eaa4d 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -21,14 +21,22 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMInverseScheduler, DDIMScheduler, DDPMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - StableDiffusionPix2PixZeroPipeline, UNet2DConditionModel) + AutoencoderKL, + DDIMInverseScheduler, + DDIMScheduler, + DDPMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + StableDiffusionPix2PixZeroPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import load_image, slow from ppdiffusers.utils.testing_utils import load_pt, require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -39,8 +47,7 @@ def to_paddle(x): # we use SGD optimizer in this pipeline, so the result is not stable! -class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionPix2PixZeroPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS @@ -51,12 +58,14 @@ def setUpClass(cls): cls.source_embeds = to_paddle( load_pt( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt" - )) + ) + ) cls.target_embeds = to_paddle( load_pt( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt" - )) + ) + ) def get_dummy_components(self): paddle.seed(0) @@ -68,7 +77,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler() paddle.seed(0) vae = AutoencoderKL( @@ -77,7 +87,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -88,10 +99,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -128,17 +139,19 @@ def test_stable_diffusion_pix2pix_zero_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.58762765, - 0.17410329, - 0.5067884, - 0.39995563, - 0.02808204, - 0.35726422, - 0.3250693, - 0.3155224, - 0.5268162, - ]) + expected_slice = np.array( + [ + 0.58762765, + 0.17410329, + 0.5067884, + 0.39995563, + 0.02808204, + 0.35726422, + 0.3250693, + 0.3155224, + 0.5268162, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_negative_prompt(self): @@ -151,40 +164,45 @@ def test_stable_diffusion_pix2pix_zero_negative_prompt(self): 
image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.5042143, - 0.34658563, - 0.56157184, - 0.3707891, - 0.23746812, - 0.47898933, - 0.2702424, - 0.36307925, - 0.50807047, - ]) + expected_slice = np.array( + [ + 0.5042143, + 0.34658563, + 0.56157184, + 0.3707891, + 0.23746812, + 0.47898933, + 0.2702424, + 0.36307925, + 0.50807047, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_euler(self): components = self.get_dummy_components() components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.4870367, - 0.2677226, - 0.37830275, - 0.63265973, - 0.32151344, - 0.406371, - 0.67513967, - 0.5246535, - 0.55954224, - ]) + expected_slice = np.array( + [ + 0.4870367, + 0.2677226, + 0.37830275, + 0.63265973, + 0.32151344, + 0.406371, + 0.67513967, + 0.5246535, + 0.55954224, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_ddpm(self): @@ -196,17 +214,19 @@ def test_stable_diffusion_pix2pix_zero_ddpm(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.5899046, - 0.17750263, - 0.50616807, - 0.39558932, - 0.02976257, - 0.35918522, - 0.32376733, - 0.31742626, - 0.52768075, - ]) + expected_slice = np.array( + [ + 0.5899046, + 0.17750263, + 0.50616807, + 0.39558932, + 0.02976257, + 0.35918522, + 0.32376733, + 0.31742626, + 0.52768075, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self): @@ -218,14 +238,12 @@ def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self): assert images.shape == (1, 64, 64, 3) num_images_per_prompt = 2 inputs = self.get_dummy_inputs() - images = sd_pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (num_images_per_prompt, 64, 64, 3) batch_size = 2 inputs = self.get_dummy_inputs() inputs["prompt"] = [inputs["prompt"]] * batch_size - images = sd_pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) # Non-determinism caused by the scheduler optimizing the latent inputs during inference @@ -245,14 +263,12 @@ def tearDown(self): @classmethod def setUpClass(cls): cls.source_embeds = to_paddle( - load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt" - )) + load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt") + ) cls.target_embeds = to_paddle( - load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt" - )) + load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt") + ) def get_inputs(self, 
seed=0): generator = paddle.Generator().manual_seed(seed=seed) @@ -272,46 +288,48 @@ def test_stable_diffusion_pix2pix_zero_default(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.8129883, - 0.81933594, - 0.80371094, - 0.8105469, - 0.8076172, - 0.80566406, - 0.81884766, - 0.8330078, - 0.82470703, - ]) + expected_slice = np.array( + [ + 0.8129883, + 0.81933594, + 0.80371094, + 0.8105469, + 0.8076172, + 0.80566406, + 0.81884766, + 0.8330078, + 0.82470703, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_k_lms(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711]) + expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711]) assert np.abs(expected_slice - image_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -319,42 +337,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.93444633, - 1.1613252, - 0.7700033, - 0.18847837, - -1.17147, - 0.07546477, - 0.06142269, - -0.8030814, - -0.59692276, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.93444633, + 1.1613252, + 0.7700033, + 0.18847837, + -1.17147, + 0.07546477, + 0.06142269, + -0.8030814, + -0.59692276, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.93180454, - 1.1606954, - 0.7721853, - 0.18454231, - -1.1679069, - 0.07357024, - 0.06213593, - -0.80399096, - -0.5937987, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.93180454, + 1.1606954, + 0.7721853, + 0.18454231, + -1.1679069, + 0.07357024, + 0.06213593, + -0.80399096, + -0.5937987, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) 
pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -385,38 +406,29 @@ def test_stable_diffusion_pix2pix_inversion(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) - pipe.inverse_scheduler = DDIMScheduler.from_config( - pipe.scheduler.config) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config( - pipe.scheduler.config) + paddle_dtype=paddle.float16, + ) + pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) caption = "a photography of a cat with flowers" pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - output = pipe.invert( - caption, - image=self.raw_image, - generator=generator, - num_inference_steps=10) + output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) inv_latents = output[0] image_slice = inv_latents[0, -3:, -3:, -1].flatten() assert tuple(inv_latents.shape) == (1, 4, 64, 64) - expected_slice = np.array([ - 0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, - -0.8599 - ]) + expected_slice = np.array([0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, -0.8599]) assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 0.05 def test_stable_diffusion_pix2pix_full(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) - pipe.inverse_scheduler = DDIMScheduler.from_config( - pipe.scheduler.config) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config( - pipe.scheduler.config) + paddle_dtype=paddle.float16, + ) + pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) caption = "a photography of a cat with flowers" pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) @@ -444,19 +456,22 @@ def test_stable_diffusion_pix2pix_full(self): generator=generator, latents=inv_latents, negative_prompt=caption, - output_type="np", ).images + output_type="np", + ).images image_slice = image[0, -3:, -3:, -1].flatten() - expected_slice = np.array([ - 0.64208984375, - 0.65673828125, - 0.650390625, - 0.6513671875, - 0.646484375, - 0.6650390625, - 0.6513671875, - 0.6640625, - 0.66796875, - ]) + expected_slice = np.array( + [ + 0.64208984375, + 0.65673828125, + 0.650390625, + 0.6513671875, + 0.646484375, + 0.6650390625, + 0.6513671875, + 0.6640625, + 0.66796875, + ] + ) max_diff = np.abs(image_slice - expected_slice).max() assert max_diff < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index d04d08d9bb18f..aa60def2d023c 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -20,8 +20,12 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - StableDiffusionSAGPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + StableDiffusionSAGPipeline, + UNet2DConditionModel, 
+) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -29,8 +33,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionSAGPipeline test_cpu_offload = False params = TEXT_TO_IMAGE_PARAMS @@ -46,13 +49,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -60,7 +65,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -71,10 +77,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -109,8 +115,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_stable_diffusion_1(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sag_pipe.set_progress_bar_config(disable=None) prompt = "." generator = paddle.Generator().manual_seed(0) @@ -120,26 +125,28 @@ def test_stable_diffusion_1(self): guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.7477613, - 0.76045597, - 0.7464366, - 0.778965, - 0.75718963, - 0.7487634, - 0.77530396, - 0.77426934, - 0.7749926, - ]) + expected_slice = np.array( + [ + 0.7477613, + 0.76045597, + 0.7464366, + 0.778965, + 0.75718963, + 0.7487634, + 0.77530396, + 0.77426934, + 0.7749926, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_2(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") + sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") sag_pipe.set_progress_bar_config(disable=None) prompt = "." 
generator = paddle.Generator().manual_seed(0) @@ -149,19 +156,22 @@ def test_stable_diffusion_2(self): guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.8771595, - 0.8521123, - 0.8644101, - 0.8680052, - 0.8700466, - 0.8897612, - 0.87766427, - 0.8636212, - 0.86829203, - ]) + expected_slice = np.array( + [ + 0.8771595, + 0.8521123, + 0.8644101, + 0.8680052, + 0.8700466, + 0.8897612, + 0.87766427, + 0.8636212, + 0.86829203, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 07d1870d2afd5..1e95848760207 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -21,10 +21,17 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, - UNet2DConditionModel, logging) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, + logging, +) from ppdiffusers.utils import load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu @@ -49,13 +56,15 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -64,7 +73,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -77,10 +87,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -112,17 +122,19 @@ def test_stable_diffusion_ddim(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.3505131, - 0.36318004, - 0.39201266, - 0.12107915, - 0.27704653, - 0.40363187, - 0.09379572, - 0.16225743, - 0.36048344, - ]) + expected_slice = np.array( + [ + 0.3505131, + 0.36318004, + 0.39201266, + 0.12107915, + 0.27704653, + 0.40363187, + 0.09379572, + 0.16225743, + 0.36048344, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_pndm(self): @@ 
-134,122 +146,127 @@ def test_stable_diffusion_pndm(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.25144678, - 0.35438284, - 0.3613463, - 0.11020249, - 0.3101831, - 0.42739886, - 0.1142821, - 0.17371863, - 0.35148838, - ]) + expected_slice = np.array( + [ + 0.25144678, + 0.35438284, + 0.3613463, + 0.11020249, + 0.3101831, + 0.42739886, + 0.1142821, + 0.17371863, + 0.35148838, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_lms(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.3676631, - 0.38155898, - 0.4023114, - 0.11294425, - 0.2891888, - 0.40432304, - 0.08882684, - 0.1466648, - 0.33633134, - ]) + expected_slice = np.array( + [ + 0.3676631, + 0.38155898, + 0.4023114, + 0.11294425, + 0.2891888, + 0.40432304, + 0.08882684, + 0.1466648, + 0.33633134, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler_ancestral(self): components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler.from_config( - components["scheduler"].config) + components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.36797395, - 0.38137895, - 0.40199342, - 0.11330777, - 0.2886864, - 0.40422022, - 0.08929691, - 0.14658183, - 0.3363046, - ]) + expected_slice = np.array( + [ + 0.36797395, + 0.38137895, + 0.40199342, + 0.11330777, + 0.2886864, + 0.40422022, + 0.08929691, + 0.14658183, + 0.3363046, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler(self): components = self.get_dummy_components() - components["scheduler"] = EulerDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.36766386, - 0.3815591, - 0.40231153, - 0.11294428, - 0.28918856, - 0.40432304, - 0.08882678, - 0.14666462, - 0.3363313, - ]) + expected_slice = np.array( + [ + 0.36766386, + 0.3815591, + 0.40231153, + 0.11294428, + 0.28918856, + 0.40432304, + 0.08882678, + 0.14666462, + 0.3363313, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_long_prompt(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) 
sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) do_classifier_free_guidance = True negative_prompt = None num_images_per_prompt = 1 - logger = logging.get_logger( - "ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") prompt = 25 * "@" with CaptureLogger(logger) as cap_logger_3: text_embeddings_3 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) prompt = 100 * "@" with CaptureLogger(logger) as cap_logger: text_embeddings = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) negative_prompt = "Hello" with CaptureLogger(logger) as cap_logger_2: text_embeddings_2 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) - assert (text_embeddings_3.shape == text_embeddings_2.shape == - text_embeddings.shape) + negative_prompt, + ) + assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape assert text_embeddings.shape[1] == 77 assert cap_logger.out == cap_logger_2.out assert cap_logger.out.count("@") == 25 @@ -279,68 +296,71 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_default_ddim(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.49493, - 0.47896, - 0.40798, - 0.54214, - 0.53212, - 0.48202, - 0.47656, - 0.46329, - 0.48506, - ]) + expected_slice = np.array( + [ + 0.49493, + 0.47896, + 0.40798, + 0.54214, + 0.53212, + 0.48202, + 0.47656, + 0.46329, + 0.48506, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_pndm(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.49493, - 0.47896, - 0.40798, - 0.54214, - 0.53212, - 0.48202, - 0.47656, - 0.46329, - 0.48506, - ]) + expected_slice = np.array( + [ + 0.49493, + 0.47896, + 0.40798, + 0.54214, + 0.53212, + 0.48202, + 0.47656, + 0.46329, + 0.48506, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_k_lms(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.1044, - 0.13115, - 0.111, - 0.10141, - 0.1144, - 0.07215, - 0.11332, - 0.09693, - 0.10006, - ]) + 
expected_slice = np.array( + [ + 0.1044, + 0.13115, + 0.111, + 0.10141, + 0.1144, + 0.07215, + 0.11332, + 0.09693, + 0.10006, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 # def test_stable_diffusion_attention_slicing(self): @@ -363,8 +383,7 @@ def test_stable_diffusion_k_lms(self): def test_stable_diffusion_text2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -372,40 +391,43 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.3862, - -0.4507, - -1.1729, - 0.0686, - -1.1045, - 0.7124, - -1.8301, - 0.1903, - 1.2773, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.3862, + -0.4507, + -1.1729, + 0.0686, + -1.1045, + 0.7124, + -1.8301, + 0.1903, + 1.2773, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.272, - -0.1863, - -0.7383, - -0.5029, - -0.7534, - 0.397, - -0.7646, - 0.4468, - 1.2686, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.272, + -0.1863, + -0.7383, + -0.5029, + -0.7534, + 0.397, + -0.7646, + 0.4468, + 1.2686, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16) + "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16 + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -437,8 +459,7 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_2_0_default_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -449,8 +470,7 @@ def test_stable_diffusion_2_0_default_ddim(self): assert max_diff < 0.01 def test_stable_diffusion_2_1_default_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -461,8 +481,7 @@ def test_stable_diffusion_2_1_default_pndm(self): assert max_diff < 0.01 def test_stable_diffusion_ddim(self): # not pass - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -474,10 +493,8 @@ def test_stable_diffusion_ddim(self): # not pass assert max_diff < 0.01 def 
test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -488,10 +505,8 @@ def test_stable_diffusion_lms(self): assert max_diff < 0.01 def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = EulerDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -502,10 +517,8 @@ def test_stable_diffusion_euler(self): assert max_diff < 0.01 def test_stable_diffusion_dpm(self): # not pass - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 25 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 56aa066eb5a02..c63bfcf099735 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -20,9 +20,12 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - StableDiffusionAttendAndExcitePipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + StableDiffusionAttendAndExcitePipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import load_numpy, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -30,8 +33,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionAttendAndExcitePipeline test_attention_slicing = False params = TEXT_TO_IMAGE_PARAMS @@ -49,13 +51,15 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -64,7 +68,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], 
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -77,10 +82,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -103,9 +108,7 @@ def get_dummy_inputs(self, seed=0): "guidance_scale": 6.0, "output_type": "numpy", "max_iter_to_alter": 2, - "thresholds": { - (0): 0.7 - }, + "thresholds": {(0): 0.7}, } return inputs @@ -117,17 +120,19 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 64, 64, 3)) - expected_slice = np.array([ - 0.33271241188049316, - 0.3123358190059662, - 0.44427454471588135, - 0.08615309000015259, - 0.26107650995254517, - 0.4551312029361725, - 0.06545555591583252, - 0.1626836657524109, - 0.3982071578502655, - ]) + expected_slice = np.array( + [ + 0.33271241188049316, + 0.3123358190059662, + 0.44427454471588135, + 0.08615309000015259, + 0.26107650995254517, + 0.4551312029361725, + 0.06545555591583252, + 0.1626836657524109, + 0.3982071578502655, + ] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) @@ -149,7 +154,8 @@ def test_attend_and_excite_fp16(self): pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) prompt = "a painting of an elephant with glasses" token_indices = [5, 7] @@ -160,7 +166,8 @@ def test_attend_and_excite_fp16(self): generator=generator, num_inference_steps=5, max_iter_to_alter=5, - output_type="numpy", ).images[0] + output_type="numpy", + ).images[0] expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy" ) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 077ed16dba212..240b7ae56d4da 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -20,30 +20,39 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - CLIPTokenizer, DPTConfig, - DPTForDepthEstimation, DPTImageProcessor) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + DPTConfig, + DPTForDepthEstimation, + DPTImageProcessor, +) from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionDepth2ImgPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionDepth2ImgPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) 
+from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionDepth2ImgPipeline test_save_load_optional_components = False params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS def get_dummy_components(self): @@ -58,7 +67,8 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -67,7 +77,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -78,10 +89,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") backbone_config = { "global_padding": "same", "layer_type": "bottleneck", @@ -107,10 +118,10 @@ def get_dummy_components(self): initializer_range=0.02, is_hybrid=True, backbone_config=backbone_config, - backbone_featmap_shape=[1, 384, 24, 24], ) + backbone_featmap_shape=[1, 384, 24, 24], + ) depth_estimator = DPTForDepthEstimation(depth_estimator_config) - feature_extractor = DPTImageProcessor.from_pretrained( - "hf-internal-testing/tiny-random-DPTForDepthEstimation") + feature_extractor = DPTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-DPTForDepthEstimation") components = { "unet": unet, "scheduler": scheduler, @@ -146,8 +157,7 @@ def test_save_load_local(self): output = pipe(**inputs)[0] with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained( - tmpdir, from_diffusers=False) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) pipe_loaded.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output_loaded = pipe_loaded(**inputs)[0] @@ -215,17 +225,19 @@ def test_stable_diffusion_depth2img_default_case(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.35397637, - 0.23190483, - 0.20131412, - 0.27374774, - 0.265134, - 0.4502194, - 0.26852018, - 0.37504935, - 0.43135768, - ]) + expected_slice = np.array( + [ + 0.35397637, + 0.23190483, + 0.20131412, + 0.27374774, + 0.265134, + 0.4502194, + 0.26852018, + 0.37504935, + 0.43135768, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_depth2img_negative_prompt(self): @@ -238,17 +250,19 @@ def 
test_stable_diffusion_depth2img_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.40259343, - 0.37764466, - 0.3936328, - 0.3628915, - 0.48100996, - 0.59685427, - 0.22927544, - 0.45186657, - 0.46950823, - ]) + expected_slice = np.array( + [ + 0.40259343, + 0.37764466, + 0.3936328, + 0.3628915, + 0.48100996, + 0.59685427, + 0.22927544, + 0.45186657, + 0.46950823, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_depth2img_multiple_init_images(self): @@ -261,17 +275,19 @@ def test_stable_diffusion_depth2img_multiple_init_images(self): image = pipe(**inputs).images image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([ - 0.8169553, - 0.4573238, - 0.27039874, - 0.60622, - 0.35670877, - 0.39508212, - 0.56803817, - 0.5341117, - 0.44428858, - ]) + expected_slice = np.array( + [ + 0.8169553, + 0.4573238, + 0.27039874, + 0.60622, + 0.35670877, + 0.39508212, + 0.56803817, + 0.5341117, + 0.44428858, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_depth2img_num_images_per_prompt(self): @@ -288,14 +304,12 @@ def test_stable_diffusion_depth2img_num_images_per_prompt(self): assert images.shape == (batch_size, 32, 32, 3) num_images_per_prompt = 2 inputs = self.get_dummy_inputs() - images = pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (num_images_per_prompt, 32, 32, 3) batch_size = 2 inputs = self.get_dummy_inputs() inputs["prompt"] = [inputs["prompt"]] * batch_size - images = pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) def test_stable_diffusion_depth2img_pil(self): @@ -305,17 +319,19 @@ def test_stable_diffusion_depth2img_pil(self): inputs = self.get_dummy_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.35397637, - 0.23190483, - 0.20131412, - 0.27374774, - 0.265134, - 0.4502194, - 0.26852018, - 0.37504935, - 0.43135768, - ]) + expected_slice = np.array( + [ + 0.35397637, + 0.23190483, + 0.20131412, + 0.27374774, + 0.265134, + 0.4502194, + 0.26852018, + 0.37504935, + 0.43135768, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -345,7 +361,8 @@ def get_inputs(self, dtype="float32", seed=0): def test_stable_diffusion_depth2img_pipeline_default(self): pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None) + "stabilityai/stable-diffusion-2-depth", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() @@ -353,22 +370,25 @@ def test_stable_diffusion_depth2img_pipeline_default(self): image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 480, 640, 3) # expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.826, 0.7747, 0.7421]) - expected_slice = np.array([ - 0.75446224, - 0.746921, - 0.7595095, - 0.8161169, - 0.8059271, - 0.7999228, - 0.9052905, - 0.879215, - 0.8690305, - ]) + expected_slice = np.array( + [ + 0.75446224, + 0.746921, + 0.7595095, + 0.8161169, + 0.8059271, + 0.7999228, + 
0.9052905, + 0.879215, + 0.8690305, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.1 def test_stable_diffusion_depth2img_pipeline_k_lms(self): pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None) + "stabilityai/stable-diffusion-2-depth", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -377,22 +397,25 @@ def test_stable_diffusion_depth2img_pipeline_k_lms(self): image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 480, 640, 3) # expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.637, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306]) - expected_slice = np.array([ - 0.6395747, - 0.64879197, - 0.6566683, - 0.6438427, - 0.6707787, - 0.63587487, - 0.66576767, - 0.62180007, - 0.6628648, - ]) + expected_slice = np.array( + [ + 0.6395747, + 0.64879197, + 0.6566683, + 0.6438427, + 0.6707787, + 0.63587487, + 0.66576767, + 0.62180007, + 0.6628648, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.1 def test_stable_diffusion_depth2img_pipeline_ddim(self): pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None) + "stabilityai/stable-diffusion-2-depth", safety_checker=None + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -401,25 +424,26 @@ def test_stable_diffusion_depth2img_pipeline_ddim(self): image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 480, 640, 3) # expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.642, 0.6522, 0.6555, 0.6436]) - expected_slice = np.array([ - 0.6283968, - 0.6419119, - 0.6295293, - 0.63652724, - 0.6420511, - 0.61574477, - 0.62251365, - 0.65826833, - 0.6480877, - ]) + expected_slice = np.array( + [ + 0.6283968, + 0.6419119, + 0.6295293, + 0.63652724, + 0.6420511, + 0.61574477, + 0.62251365, + 0.65826833, + 0.6480877, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.15 def test_stable_diffusion_depth2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -427,25 +451,27 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 60, 80) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -1.148, - -0.2147, - -0.618, - -2.48, - -2.348, - 0.3945, - -2.05, - -1.566, - -1.52, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.1 + expected_slice = np.array( + [ + -1.148, + -0.2147, + -0.618, + -2.48, + -2.348, + 0.3945, + -2.05, + -1.566, + -1.52, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.1 callback_fn.has_been_called = False pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( "stabilityai/stable-diffusion-2-depth", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py 
b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 6e0d5f33a5bdc..a926f2ed14718 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -22,18 +22,23 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, PNDMScheduler, - StableDiffusionInpaintPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + PNDMScheduler, + StableDiffusionInpaintPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS @@ -50,7 +55,8 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -60,7 +66,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -73,10 +80,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -91,11 +98,8 @@ def get_dummy_components(self): def get_dummy_inputs(self, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (64, 64))) + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) generator = paddle.Generator().manual_seed(seed) inputs = { @@ -117,17 +121,19 @@ def test_stable_diffusion_inpaint(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.58470726, - 0.49302375, - 0.3954028, - 0.4068969, - 0.33668613, - 0.50350493, - 0.34411103, - 0.25261122, - 0.4531455, - ]) + expected_slice = np.array( + [ + 0.58470726, + 0.49302375, + 0.3954028, + 0.4068969, + 0.33668613, + 0.50350493, + 0.34411103, + 0.25261122, + 0.4531455, + ] + ) assert 
np.abs(image_slice.flatten() - expected_slice).max() < 0.01 @@ -151,8 +157,7 @@ def test_stable_diffusion_inpaint_pipeline(self): # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/yellow_cat_sitting_on_a_park_bench.npy' # ) model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "Face of a yellow cat, high resolution, sitting on a park bench" @@ -162,7 +167,8 @@ def test_stable_diffusion_inpaint_pipeline(self): image=init_image, mask_image=mask_image, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (512, 512, 3) image = image[-3:, -3:, -1] @@ -186,7 +192,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self): # ) model_id = "stabilityai/stable-diffusion-2-inpainting" pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16, safety_checker=None) + model_id, paddle_dtype=paddle.float16, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "Face of a yellow cat, high resolution, sitting on a park bench" @@ -196,7 +203,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self): image=init_image, mask_image=mask_image, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (512, 512, 3) image = image[-3:, -3:, -1] diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index 0224ae1e8b294..ec93a578bbaf2 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -21,19 +21,24 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, EulerDiscreteScheduler, - StableDiffusionLatentUpscalePipeline, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + EulerDiscreteScheduler, + StableDiffusionLatentUpscalePipeline, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionLatentUpscalePipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { "height", @@ -42,9 +47,7 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, "negative_prompt_embeds", "prompt_embeds", } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt" - } + required_optional_params = PipelineTesterMixin.required_optional_params - 
{"num_images_per_prompt"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS test_cpu_offload = False @@ -53,8 +56,7 @@ def dummy_image(self): batch_size = 1 num_channels = 4 sizes = 16, 16 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image def get_dummy_components(self): @@ -72,7 +74,8 @@ def get_dummy_components(self): "KDownBlock2D", "KCrossAttnDownBlock2D", "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", ), + "KCrossAttnDownBlock2D", + ), in_channels=8, mid_block_type=None, only_cross_attention=False, @@ -84,7 +87,9 @@ def get_dummy_components(self): "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", - "KUpBlock2D", ), ) + "KUpBlock2D", + ), + ) vae = AutoencoderKL( block_out_channels=[32, 32, 64, 64], in_channels=3, @@ -101,7 +106,8 @@ def get_dummy_components(self): "UpDecoderBlock2D", "UpDecoderBlock2D", ], - latent_channels=4, ) + latent_channels=4, + ) scheduler = EulerDiscreteScheduler(prediction_type="sample") text_config = CLIPTextConfig( bos_token_id=0, @@ -114,10 +120,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="quick_gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": model.eval(), "vae": vae.eval(), @@ -147,17 +153,19 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 256, 256, 3)) - expected_slice = np.array([ - 0.5665861368179321, - 0.7449524402618408, - 0.0, - 0.1325536072254181, - 0.4274534583091736, - 0.0, - 0.0, - 0.14426982402801514, - 0.0, - ]) + expected_slice = np.array( + [ + 0.5665861368179321, + 0.7449524402618408, + 0.0, + 0.1325536072254181, + 0.4274534583091736, + 0.0, + 0.0, + 0.14426982402801514, + 0.0, + ] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) @@ -175,25 +183,23 @@ def tearDown(self): def test_latent_upscaler_fp16(self): generator = paddle.Generator().manual_seed(seed=33) - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) pipe.to("gpu") upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( - "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16) + "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16 + ) upscaler.to("gpu") - prompt = ( - "a photo of an astronaut high resolution, unreal engine, ultra realistic" - ) - low_res_latents = pipe( - prompt, generator=generator, output_type="latent").images + prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic" + low_res_latents = pipe(prompt, generator=generator, output_type="latent").images image = upscaler( prompt=prompt, image=low_res_latents, num_inference_steps=20, guidance_scale=0, generator=generator, - output_type="np", ).images[0] + output_type="np", + ).images[0] # invalid expected_image # expected_image = load_numpy( # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy" @@ -209,7 +215,8 @@ def test_latent_upscaler_fp16(self): 
def test_latent_upscaler_fp16_image(self): generator = paddle.Generator().manual_seed(seed=33) upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( - "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16) + "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16 + ) upscaler.to("gpu") prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas" @@ -222,7 +229,8 @@ def test_latent_upscaler_fp16_image(self): num_inference_steps=20, guidance_scale=0, generator=generator, - output_type="np", ).images[0] + output_type="np", + ).images[0] # invalid expected_image # expected_image = load_numpy( # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy" diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py index ca4e467ebdca2..35a135bc747e3 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py @@ -22,8 +22,13 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - StableDiffusionUpscalePipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + StableDiffusionUpscalePipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -39,8 +44,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = (32, 32) - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -55,15 +59,16 @@ def dummy_cond_unet_upscale(self): down_block_types=( "DownBlock2D", "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", ), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", - "UpBlock2D"), + "CrossAttnDownBlock2D", + ), + up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, # SD2-specific config below attention_head_dim=8, use_linear_projection=True, only_cross_attention=(True, True, False), - num_class_embeds=100, ) + num_class_embeds=100, + ) return model @property @@ -78,10 +83,9 @@ def dummy_vae(self): "DownEncoderBlock2D", "DownEncoderBlock2D", ], - up_block_types=[ - "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D" - ], - latent_channels=4, ) + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) return model @property @@ -99,7 +103,8 @@ def dummy_text_encoder(self): vocab_size=1000, # SD2-specific config below hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) return CLIPTextModel(config).eval() def test_stable_diffusion_upscale(self): @@ -108,11 +113,9 @@ def test_stable_diffusion_upscale(self): scheduler = DDIMScheduler(prediction_type="v_prediction") vae = self.dummy_vae text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 
64)) + low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) sd_pipe = StableDiffusionUpscalePipeline( unet=unet, low_res_scheduler=low_res_scheduler, @@ -120,7 +123,8 @@ def test_stable_diffusion_upscale(self): vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - max_noise_level=350, ) + max_noise_level=350, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -131,7 +135,8 @@ def test_stable_diffusion_upscale(self): guidance_scale=6.0, noise_level=20, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -142,26 +147,27 @@ def test_stable_diffusion_upscale(self): noise_level=20, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, - 3) - expected_slice = np.array([ - 0.0, - 0.0, - 0.3616839, - 0.0, - 0.04877859, - 0.59195685, - 0.23902711, - 0.00838843, - 0.5172206, - ]) + assert image.shape == (1, expected_height_width, expected_height_width, 3) + expected_slice = np.array( + [ + 0.0, + 0.0, + 0.3616839, + 0.0, + 0.04877859, + 0.59195685, + 0.23902711, + 0.00838843, + 0.5172206, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_upscale_batch(self): unet = self.dummy_cond_unet_upscale @@ -169,11 +175,9 @@ def test_stable_diffusion_upscale_batch(self): scheduler = DDIMScheduler(prediction_type="v_prediction") vae = self.dummy_vae text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) + low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) sd_pipe = StableDiffusionUpscalePipeline( unet=unet, low_res_scheduler=low_res_scheduler, @@ -181,7 +185,8 @@ def test_stable_diffusion_upscale_batch(self): vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - max_noise_level=350, ) + max_noise_level=350, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" output = sd_pipe( @@ -190,7 +195,8 @@ def test_stable_diffusion_upscale_batch(self): guidance_scale=6.0, noise_level=20, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images assert image.shape[0] == 2 generator = paddle.Generator().manual_seed(0) @@ -202,7 +208,8 @@ def test_stable_diffusion_upscale_batch(self): guidance_scale=6.0, noise_level=20, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images assert image.shape[0] == 2 @@ -213,11 +220,9 @@ def test_stable_diffusion_upscale_fp16(self): scheduler = DDIMScheduler(prediction_type="v_prediction") vae = self.dummy_vae text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - 
"hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) + low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) unet = unet.to(dtype=paddle.float16) text_encoder = text_encoder.to(dtype=paddle.float16) sd_pipe = StableDiffusionUpscalePipeline( @@ -227,7 +232,8 @@ def test_stable_diffusion_upscale_fp16(self): vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - max_noise_level=350, ) + max_noise_level=350, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -236,10 +242,10 @@ def test_stable_diffusion_upscale_fp16(self): image=low_res_image, generator=generator, num_inference_steps=2, - output_type="np", ).images + output_type="np", + ).images expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, - 3) + assert image.shape == (1, expected_height_width, expected_height_width, 3) @slow @@ -264,8 +270,7 @@ def test_stable_diffusion_upscale_pipeline(self): pipe.enable_attention_slicing() prompt = "a cat sitting on a park bench" generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, image=image, generator=generator, output_type="np") + output = pipe(prompt=prompt, image=image, generator=generator, output_type="np") image = output.images[0] assert image.shape == (512, 512, 3) image = image[-3:, -3:, -1] diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index daa755dc68597..b482ca6657633 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -20,9 +20,14 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, EulerDiscreteScheduler, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerDiscreteScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -46,7 +51,8 @@ def dummy_cond_unet(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) return model @property @@ -59,7 +65,8 @@ def dummy_vae(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) return model @property @@ -76,7 +83,8 @@ def dummy_text_encoder(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=64, ) + projection_dim=64, + ) return CLIPTextModel(config).eval() def test_stable_diffusion_v_pred_ddim(self): @@ -87,11 +95,11 @@ def test_stable_diffusion_v_pred_ddim(self): beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - prediction_type="v_prediction", ) + prediction_type="v_prediction", + ) vae = self.dummy_vae bert = 
self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -100,7 +108,8 @@ def test_stable_diffusion_v_pred_ddim(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -109,7 +118,8 @@ def test_stable_diffusion_v_pred_ddim(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -118,24 +128,26 @@ def test_stable_diffusion_v_pred_ddim(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.36126757, - 0.40778637, - 0.36956796, - 0.14816678, - 0.25735706, - 0.36562037, - 0.1229952, - 0.22826642, - 0.4154452, - ]) + expected_slice = np.array( + [ + 0.36126757, + 0.40778637, + 0.36956796, + 0.14816678, + 0.25735706, + 0.36562037, + 0.1229952, + 0.22826642, + 0.4154452, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_k_euler(self): unet = self.dummy_cond_unet @@ -143,11 +155,11 @@ def test_stable_diffusion_v_pred_k_euler(self): beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - prediction_type="v_prediction", ) + prediction_type="v_prediction", + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -156,7 +168,8 @@ def test_stable_diffusion_v_pred_k_euler(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -165,7 +178,8 @@ def test_stable_diffusion_v_pred_k_euler(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -174,24 +188,26 @@ def test_stable_diffusion_v_pred_k_euler(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.39991996, - 0.45191997, - 0.34044766, - 0.2136086, - 0.2758901, - 0.31222183, - 0.21658134, - 0.34479994, - 0.43742967, - ]) + expected_slice = np.array( + [ + 0.39991996, + 0.45191997, + 0.34044766, + 0.2136086, + 0.2758901, + 0.31222183, + 
0.21658134, + 0.34479994, + 0.43742967, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_fp16(self): """Test that stable diffusion v-prediction works with fp16""" @@ -202,11 +218,11 @@ def test_stable_diffusion_v_pred_fp16(self): beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - prediction_type="v_prediction", ) + prediction_type="v_prediction", + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet = unet.to(dtype=paddle.float16) vae = vae.to(dtype=paddle.float16) bert = bert.to(dtype=paddle.float16) @@ -218,15 +234,12 @@ def test_stable_diffusion_v_pred_fp16(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) - image = sd_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np").images + image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images assert image.shape == (1, 64, 64, 3) @@ -239,8 +252,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_stable_diffusion_v_pred_default(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" @@ -250,26 +262,30 @@ def test_stable_diffusion_v_pred_default(self): generator=generator, guidance_scale=7.5, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.05667132, - 0.05700234, - 0.04156408, - 0.04631725, - 0.04327643, - 0.06003231, - 0.05165312, - 0.05258191, - 0.0865913, - ]) + expected_slice = np.array( + [ + 0.05667132, + 0.05700234, + 0.04156408, + 0.04631725, + 0.04327643, + 0.06003231, + 0.05165312, + 0.05258191, + 0.0865913, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_upcast_attention(self): sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16) + "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16 + ) sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" @@ -279,52 +295,51 @@ def test_stable_diffusion_v_pred_upcast_attention(self): generator=generator, guidance_scale=7.5, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.04541016, - 0.04516602, - 0.05493164, - 0.05078125, - 0.04296875, - 0.07275391, - 0.06567383, - 0.0534668, - 0.04833984, - ]) + expected_slice = np.array( + [ + 0.04541016, + 0.04516602, + 0.05493164, + 0.05078125, + 
0.04296875, + 0.07275391, + 0.06567383, + 0.0534668, + 0.04833984, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_v_pred_euler(self): - scheduler = EulerDiscreteScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", scheduler=scheduler) + scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - num_inference_steps=5, - output_type="numpy") + output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy") image = output.images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.03515199, - 0.03756374, - 0.05046153, - 0.04240236, - 0.05509549, - 0.06556576, - 0.04710263, - 0.02758819, - 0.05959105, - ]) + expected_slice = np.array( + [ + 0.03515199, + 0.03756374, + 0.05046153, + 0.04240236, + 0.05509549, + 0.06556576, + 0.04710263, + 0.02758819, + 0.05959105, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_dpm(self): @@ -332,9 +347,9 @@ def test_stable_diffusion_v_pred_dpm(self): TODO: update this test after making DPM compatible with V-prediction! """ scheduler = DPMSolverMultistepScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", scheduler=scheduler) + "stabilityai/stable-diffusion-2", subfolder="scheduler" + ) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "a photograph of an astronaut riding a horse" @@ -344,20 +359,23 @@ def test_stable_diffusion_v_pred_dpm(self): generator=generator, guidance_scale=7.5, num_inference_steps=5, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.20492354, - 0.2115368, - 0.2323401, - 0.2415919, - 0.25598443, - 0.24843931, - 0.25171167, - 0.23580211, - 0.23604062, - ]) + expected_slice = np.array( + [ + 0.20492354, + 0.2115368, + 0.2323401, + 0.2415919, + 0.25598443, + 0.24843931, + 0.25171167, + 0.23580211, + 0.23604062, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 # def test_stable_diffusion_attention_slicing_v_pred(self): @@ -387,30 +405,27 @@ def test_stable_diffusion_text2img_pipeline_v_pred_default(self): # expected_image = load_numpy( # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred.npy' # ) - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") pipe.enable_attention_slicing() pipe.set_progress_bar_config(disable=None) prompt = "astronaut riding a horse" generator = paddle.Generator().manual_seed(0) - output = pipe( - 
prompt=prompt, - guidance_scale=7.5, - generator=generator, - output_type="np") + output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") image = output.images[0] assert image.shape == (768, 768, 3) - expected_image = np.array([ - 0.26713198, - 0.2630347, - 0.25486767, - 0.23375505, - 0.24399692, - 0.22363415, - 0.24688962, - 0.21346492, - 0.23014635, - ]) + expected_image = np.array( + [ + 0.26713198, + 0.2630347, + 0.25486767, + 0.23375505, + 0.24399692, + 0.22363415, + 0.24688962, + 0.21346492, + 0.23014635, + ] + ) image = image[-3:, -3:, -1].flatten() assert np.abs(expected_image - image).max() < 0.075 @@ -419,37 +434,33 @@ def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self): # expected_image = load_numpy( # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy' # ) - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) prompt = "astronaut riding a horse" generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, - guidance_scale=7.5, - generator=generator, - output_type="np") + output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") image = output.images[0] assert image.shape == (768, 768, 3) - expected_image = np.array([ - 0.26220703, - 0.25195312, - 0.2434082, - 0.22753906, - 0.23632812, - 0.21777344, - 0.23901367, - 0.20629883, - 0.22192383, - ]) + expected_image = np.array( + [ + 0.26220703, + 0.25195312, + 0.2434082, + 0.22753906, + 0.23632812, + 0.21777344, + 0.23901367, + 0.20629883, + 0.22192383, + ] + ) image = image[-3:, -3:, -1].flatten() assert np.abs(expected_image - image).max() < 0.75 def test_stable_diffusion_text2img_intermediate_state_v_pred(self): number_of_steps = 0 - def test_callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def test_callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: test_callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -457,40 +468,41 @@ def test_callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 96, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.2542, - -1.276, - 0.426, - -0.956, - -1.173, - -0.5884, - 2.416, - 0.1553, - -1.21, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.2542, + -1.276, + 0.426, + -0.956, + -1.173, + -0.5884, + 2.416, + 0.1553, + -1.21, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 19: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 96, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.959, - -0.964, - -0.614, - 0.0977, - -0.6953, - -0.2343, - 1.551, - -0.03357, - -0.11395, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.959, + -0.964, + -0.614, + 0.0977, + -0.6953, + -0.2343, + 1.551, + -0.03357, + -0.11395, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 test_callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", 
paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "Andromeda galaxy in a bottle" @@ -501,6 +513,7 @@ def test_callback_fn(step: int, timestep: int, guidance_scale=7.5, generator=generator, callback=test_callback_fn, - callback_steps=1, ) + callback_steps=1, + ) assert test_callback_fn.has_been_called assert number_of_steps == 20 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py index 2bfa1261d9065..b2bdac5b34ed7 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py @@ -22,10 +22,16 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion_safe import \ - StableDiffusionPipelineSafe as StableDiffusionPipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion_safe import ( + StableDiffusionPipelineSafe as StableDiffusionPipeline, +) from ppdiffusers.utils import floats_tensor, nightly from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -41,8 +47,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -56,7 +61,8 @@ def dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -68,7 +74,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -83,7 +90,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -108,11 +116,11 @@ def test_safe_diffusion_ddim(self): beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -120,7 +128,8 @@ def test_safe_diffusion_ddim(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -129,7 +138,8 @@ def test_safe_diffusion_ddim(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = 
output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -138,32 +148,33 @@ def test_safe_diffusion_ddim(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28519452, - 0.23807159, - 0.38150585, - 0.21930319, - 0.26092738, - 0.517212, - 0.2563907, - 0.2503956, - 0.47978917, - ]) + expected_slice = np.array( + [ + 0.28519452, + 0.23807159, + 0.38150585, + 0.21930319, + 0.26092738, + 0.517212, + 0.2563907, + 0.2503956, + 0.47978917, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_pndm(self): unet = self.dummy_cond_unet scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -171,7 +182,8 @@ def test_stable_diffusion_pndm(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -180,7 +192,8 @@ def test_stable_diffusion_pndm(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -189,29 +202,31 @@ def test_stable_diffusion_pndm(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.18763152, - 0.24242553, - 0.36067978, - 0.21772456, - 0.27213728, - 0.5194623, - 0.2227565, - 0.2217454, - 0.4453961, - ]) + expected_slice = np.array( + [ + 0.18763152, + 0.24242553, + 0.36067978, + 0.21772456, + 0.27213728, + 0.5194623, + 0.2227565, + 0.2217454, + 0.4453961, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) assert isinstance(pipe, StableDiffusionPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -219,8 +234,7 @@ def test_stable_diffusion_no_safety_checker(self): assert image is not None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, 
from_diffusers=False) assert pipe.safety_checker is None image = pipe("example prompt", num_inference_steps=2).images[0] assert image is not None @@ -231,8 +245,7 @@ def test_stable_diffusion_fp16(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet = unet.to(dtype=paddle.float16) vae = vae.to(dtype=paddle.float16) bert = bert.to(dtype=paddle.float16) @@ -243,11 +256,11 @@ def test_stable_diffusion_fp16(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" - image = sd_pipe( - [prompt], num_inference_steps=2, output_type="np").images + image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images assert image.shape == (1, 64, 64, 3) @@ -260,10 +273,8 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_harm_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) prompt = "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and children from bahnhof zoo, detailed " seed = 4003660346 @@ -277,7 +288,8 @@ def test_harm_safe_stable_diffusion(self): output_type="np", width=512, height=512, - sld_guidance_scale=0, ) + sld_guidance_scale=0, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -306,7 +318,8 @@ def test_harm_safe_stable_diffusion(self): sld_warmup_steps=7, sld_threshold=0.025, sld_momentum_scale=0.5, - sld_mom_beta=0.7, ) + sld_mom_beta=0.7, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -324,10 +337,8 @@ def test_harm_safe_stable_diffusion(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_nudity_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) prompt = "padme amidala taking a bath artwork, safe for work, no nudity" seed = 2734971755 @@ -341,7 +352,8 @@ def test_nudity_safe_stable_diffusion(self): output_type="np", width=512, height=512, - sld_guidance_scale=0, ) + sld_guidance_scale=0, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -370,7 +382,8 @@ def test_nudity_safe_stable_diffusion(self): sld_warmup_steps=7, sld_threshold=0.025, sld_momentum_scale=0.5, - 
sld_mom_beta=0.7, ) + sld_mom_beta=0.7, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -388,8 +401,7 @@ def test_nudity_safe_stable_diffusion(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_nudity_safetychecker_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.set_progress_bar_config(disable=None) prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker" seed = 1044355234 @@ -403,7 +415,8 @@ def test_nudity_safetychecker_safe_stable_diffusion(self): output_type="np", width=512, height=512, - sld_guidance_scale=0, ) + sld_guidance_scale=0, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) @@ -422,12 +435,10 @@ def test_nudity_safetychecker_safe_stable_diffusion(self): sld_warmup_steps=7, sld_threshold=0.025, sld_momentum_scale=0.5, - sld_mom_beta=0.7, ) + sld_mom_beta=0.7, + ) image = output.images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, - 0.6561 - ]) + expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]) assert image.shape == (1, 512, 512, 3) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py index 79cfcb2145995..fb5982706c2c9 100644 --- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -16,14 +16,24 @@ import unittest import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - CLIPTextModelWithProjection, CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, +) -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - PriorTransformer, StableUnCLIPPipeline, - UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import \ - StableUnCLIPImageNormalizer +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + PriorTransformer, + StableUnCLIPPipeline, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import ( + StableUnCLIPImageNormalizer, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -39,8 +49,7 @@ def get_dummy_components(self): embedder_hidden_size = 32 embedder_projection_dim = embedder_hidden_size paddle.seed(0) - prior_tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.seed(0) prior_text_encoder = CLIPTextModelWithProjection( CLIPTextConfig( @@ -53,13 +62,16 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, )) + vocab_size=1000, + ) + ) paddle.seed(0) prior = PriorTransformer( num_attention_heads=2, attention_head_dim=12, embedding_dim=embedder_projection_dim, - num_layers=1, ) + num_layers=1, + ) 
paddle.seed(0) prior_scheduler = DDPMScheduler( variance_type="fixed_small_log", @@ -67,15 +79,13 @@ def get_dummy_components(self): num_train_timesteps=1000, clip_sample=True, clip_sample_range=5.0, - beta_schedule="squaredcos_cap_v2", ) + beta_schedule="squaredcos_cap_v2", + ) paddle.seed(0) - image_normalizer = StableUnCLIPImageNormalizer( - embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler( - beta_schedule="squaredcos_cap_v2") + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) + image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") paddle.seed(0) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.seed(0) text_encoder = CLIPTextModel( CLIPTextConfig( @@ -88,7 +98,9 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, )) + vocab_size=1000, + ) + ) paddle.seed(0) unet = UNet2DConditionModel( sample_size=32, @@ -103,7 +115,8 @@ def get_dummy_components(self): cross_attention_dim=embedder_hidden_size, layers_per_block=1, upcast_attention=True, - use_linear_projection=True, ) + use_linear_projection=True, + ) paddle.seed(0) scheduler = DDIMScheduler( beta_schedule="scaled_linear", @@ -111,7 +124,8 @@ def get_dummy_components(self): beta_end=0.012, prediction_type="v_prediction", set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) paddle.seed(0) vae = AutoencoderKL() components = { @@ -143,13 +157,11 @@ def get_dummy_inputs(self, seed=0): def test_attention_slicing_forward_pass(self): test_max_difference = False - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) def test_inference_batch_single_identical(self): test_max_difference = False - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference) + self._test_inference_batch_single_identical(test_max_difference=test_max_difference) # @slow diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index aa2328fb72a16..eb769ee92815b 100644 --- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -19,24 +19,36 @@ import numpy as np import paddle from paddlenlp.transformers import ( - CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, - CLIPVisionConfig, CLIPVisionModelWithProjection) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - StableUnCLIPImg2ImgPipeline, UNet2DConditionModel) + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + StableUnCLIPImg2ImgPipeline, + UNet2DConditionModel, +) from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import \ - StableUnCLIPImageNormalizer +from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import ( + StableUnCLIPImageNormalizer, +) from ppdiffusers.utils.import_utils import is_ppxformers_available from ppdiffusers.utils.testing_utils import floats_tensor -from 
..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableUnCLIPImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS @@ -53,15 +65,14 @@ def get_dummy_components(self): num_attention_heads=4, image_size=32, intermediate_size=37, - patch_size=1, )) + patch_size=1, + ) + ) paddle.seed(0) - image_normalizer = StableUnCLIPImageNormalizer( - embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler( - beta_schedule="squaredcos_cap_v2") + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) + image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") paddle.seed(0) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.seed(0) text_encoder = CLIPTextModel( CLIPTextConfig( @@ -74,7 +85,9 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, )) + vocab_size=1000, + ) + ) paddle.seed(0) unet = UNet2DConditionModel( sample_size=32, @@ -89,7 +102,8 @@ def get_dummy_components(self): cross_attention_dim=embedder_hidden_size, layers_per_block=1, upcast_attention=True, - use_linear_projection=True, ) + use_linear_projection=True, + ) paddle.seed(0) scheduler = DDIMScheduler( beta_schedule="scaled_linear", @@ -97,7 +111,8 @@ def get_dummy_components(self): beta_end=0.012, prediction_type="v_prediction", set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) paddle.seed(0) vae = AutoencoderKL() components = { @@ -124,17 +139,19 @@ def test_image_embeds_none(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.40317363, - 1.0, - 0.5802471, - 0.47334313, - 0.39546987, - 0.72409034, - 0.15691131, - 0.42981434, - 0.72585064, - ]) + expected_slice = np.array( + [ + 0.40317363, + 1.0, + 0.5802471, + 0.47334313, + 0.39546987, + 0.72409034, + 0.15691131, + 0.42981434, + 0.72585064, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -145,8 +162,7 @@ def get_dummy_inputs(self, seed=0, pil_image=True): if pil_image: input_image = input_image * 0.5 + 0.5 input_image = input_image.clip(min=0, max=1) - input_image = (input_image.cpu().transpose( - perm=[0, 2, 3, 1]).cast("float32").numpy()) + input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy() input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] return { "prompt": "An anime racoon running a marathon", @@ -158,21 +174,18 @@ def get_dummy_inputs(self, seed=0, pil_image=True): def test_attention_slicing_forward_pass(self): test_max_difference = False - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) def test_inference_batch_single_identical(self): test_max_difference = False - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference) + 
self._test_inference_batch_single_identical(test_max_difference=test_max_difference) @unittest.skipIf( not is_ppxformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - test_max_difference=False) + self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) # @slow diff --git a/ppdiffusers/tests/pipelines/test_pipelines.py b/ppdiffusers/tests/pipelines/test_pipelines.py index ce6bcc0752a00..ef0b785f3ed4a 100644 --- a/ppdiffusers/tests/pipelines/test_pipelines.py +++ b/ppdiffusers/tests/pipelines/test_pipelines.py @@ -18,7 +18,6 @@ import os import random import shutil -import sys import tempfile import unittest import unittest.mock as mock @@ -29,24 +28,50 @@ import requests_mock import safetensors.torch from paddlenlp.transformers import ( - CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer) + CLIPImageProcessor, + CLIPModel, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, +) from parameterized import parameterized from PIL import Image from requests.exceptions import HTTPError from ppdiffusers import ( - AutoencoderKL, DDIMPipeline, DDIMScheduler, DDPMPipeline, DDPMScheduler, - DiffusionPipeline, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipelineLegacy, StableDiffusionPipeline, - UNet2DConditionModel, UNet2DModel, logging) + AutoencoderKL, + DDIMPipeline, + DDIMScheduler, + DDPMPipeline, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionPipeline, + UNet2DConditionModel, + UNet2DModel, + logging, +) from ppdiffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from ppdiffusers.utils import (CONFIG_NAME, TORCH_WEIGHTS_NAME, floats_tensor, - nightly, slow) -from ppdiffusers.utils.testing_utils import (CaptureLogger, get_tests_dir, - require_compel, require_paddle_gpu, - require_torch) +from ppdiffusers.utils import ( + CONFIG_NAME, + TORCH_WEIGHTS_NAME, + floats_tensor, + nightly, + slow, +) +from ppdiffusers.utils.testing_utils import ( + CaptureLogger, + get_tests_dir, + require_compel, + require_paddle_gpu, + require_torch, +) class DownloadTests(unittest.TestCase): @@ -57,12 +82,12 @@ def test_one_request_upon_cached(self): "hf-internal-testing/tiny-stable-diffusion-pipe", cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) download_requests = [r.method for r in m.request_history] assert download_requests.count("HEAD") == 15, "15 calls to files" - assert (download_requests.count("GET") == 17 - ), "15 calls to files + model_info + model_index.json" + assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json" assert ( len(download_requests) == 32 ), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json" @@ -73,11 +98,11 @@ def test_one_request_upon_cached(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) cache_requests = [r.method for r in m.request_history] - assert cache_requests.count( - "HEAD") == 1, "model_index.json is only HEAD" + assert 
cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" assert cache_requests.count("GET") == 1, "model info is only GET" assert ( len(cache_requests) == 2 @@ -90,7 +115,8 @@ def test_less_downloads_passed_object(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) # make sure safety checker is not downloaded assert "safety_checker" not in os.listdir(cached_folder) @@ -112,14 +138,14 @@ def test_less_downloads_passed_object_calls(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) download_requests = [r.method for r in m.request_history] # 15 - 2 because no call to config or model file for `safety_checker` assert download_requests.count("HEAD") == 13, "13 calls to files" # 17 - 2 because no call to config or model file for `safety_checker` - assert (download_requests.count("GET") == 15 - ), "13 calls to files + model_info + model_index.json" + assert download_requests.count("GET") == 15, "13 calls to files + model_info + model_index.json" assert ( len(download_requests) == 28 ), "2 calls per file (13 files) + send_telemetry, model_info and model_index.json" @@ -130,11 +156,11 @@ def test_less_downloads_passed_object_calls(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) cache_requests = [r.method for r in m.request_history] - assert cache_requests.count( - "HEAD") == 1, "model_index.json is only HEAD" + assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" assert cache_requests.count("GET") == 1, "model info is only GET" assert ( len(cache_requests) == 2 @@ -147,15 +173,11 @@ def test_download_only_pytorch(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - all_root_files = [ - t[-1] - for t in os.walk( - os.path.join(tmpdirname, - os.listdir(tmpdirname)[0], "snapshots")) - ] + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))] files = [item for sublist in all_root_files for item in sublist] assert not any(f.endswith(".msgpack") for f in files) assert not any(f.endswith(".safetensors") for f in files) @@ -163,25 +185,18 @@ def test_download_only_pytorch(self): def test_returned_cached_folder(self): prompt = "hello" pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) _, local_path = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, - return_cached_folder=True, ) + return_cached_folder=True, + ) pipe_2 = StableDiffusionPipeline.from_pretrained(local_path) generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_force_safetensors_error(self): @@ -194,7 +209,8 @@ def 
test_force_safetensors_error(self): from_diffusers=True, safety_checker=None, cache_dir=tmpdirname, - use_safetensors=True, ) + use_safetensors=True, + ) def test_download_safetensors(self): with tempfile.TemporaryDirectory() as tmpdirname: @@ -204,7 +220,8 @@ def test_download_safetensors(self): from_diffusers=True, safety_checker=None, cache_dir=tmpdirname, - use_safetensors=True, ) + use_safetensors=True, + ) all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] files = [item for sublist in all_root_files for item in sublist] @@ -219,11 +236,10 @@ def test_download_safetensors_index(self): use_safetensors=True, variant=variant, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) - all_root_files = [ - t[-1] for t in os.walk(os.path.join(tmpdirname)) - ] + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] files = [item for sublist in all_root_files for item in sublist] # None of the downloaded files should be a safetensors file even if we have some here: @@ -246,11 +262,10 @@ def test_download_bin_index(self): use_safetensors=False, variant=variant, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) - all_root_files = [ - t[-1] for t in os.walk(os.path.join(tmpdirname)) - ] + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] files = [item for sublist in all_root_files for item in sublist] # None of the downloaded files should be a safetensors file even if we have some here: @@ -267,66 +282,39 @@ def test_download_bin_index(self): def test_download_no_safety_checker(self): prompt = "hello" pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images - pipe_2 = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch") + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images + pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_load_no_safety_checker_explicit_locally(self): prompt = "hello" pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe_2 = StableDiffusionPipeline.from_pretrained( - tmpdirname, safety_checker=None) + pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None) generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, 
output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_load_no_safety_checker_default_locally(self): prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch") + pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname) generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_cached_files_are_used_when_no_internet(self): @@ -336,21 +324,16 @@ def test_cached_files_are_used_when_no_internet(self): response_mock.raise_for_status.side_effect = HTTPError response_mock.json.return_value = {} orig_pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) - orig_comps = { - k: v - for k, v in orig_pipe.components.items() if hasattr(v, "parameters") - } + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) + orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")} with mock.patch("requests.request", return_value=response_mock): pipe = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, - local_files_only=True, ) - comps = { - k: v - for k, v in pipe.components.items() if hasattr(v, "parameters") - } + local_files_only=True, + ) + comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")} for m1, m2 in zip(orig_comps.values(), comps.values()): for p1, p2 in zip(m1.parameters(), m2.parameters()): if (p1 != p2).sum() > 0: @@ -365,11 +348,11 @@ def test_download_from_variant_folder(self): with tempfile.TemporaryDirectory() as tmpdirname: tmpdirname = StableDiffusionPipeline.download( "hf-internal-testing/stable-diffusion-all-variants", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) all_root_files = [t[-1] for t in os.walk(tmpdirname)] files = [item for sublist in all_root_files for item in sublist] - assert (len(files) == 15 - ), f"We should only download 15 files, not {len(files)}" + assert len(files) == 15, f"We should only download 15 files, not {len(files)}" assert not any(f.endswith(other_format) for f in files) assert not any(len(f.split(".")) == 3 for f in files) ppdiffusers.utils.import_utils._safetensors_available = True @@ -386,22 +369,15 @@ def test_download_variant_all(self): StableDiffusionPipeline.from_pretrained( "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, - variant=variant, ) + variant=variant, + ) all_root_files = [ - t[-1] - for t in os.walk( - os.path.join(tmpdirname, - os.listdir(tmpdirname)[0], "snapshots")) + t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots")) ] files = [item for sublist in all_root_files for item in sublist] - assert (len(files) == 15 - ), f"We should only download 15 files, not {len(files)}" - assert (len([ - f for f in files if 
f.endswith(f"{variant}{this_format}") - ]) == 4) - assert not any( - f.endswith(this_format) and - not f.endswith(f"{variant}{this_format}") for f in files) + assert len(files) == 15, f"We should only download 15 files, not {len(files)}" + assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4 + assert not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) assert not any(f.endswith(other_format) for f in files) ppdiffusers.utils.import_utils._safetensors_available = True @@ -417,21 +393,16 @@ def test_download_variant_partly(self): tmpdirname = StableDiffusionPipeline.download( "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, - variant=variant, ) + variant=variant, + ) all_root_files = [t[-1] for t in os.walk(tmpdirname)] files = [item for sublist in all_root_files for item in sublist] unet_files = os.listdir(os.path.join(tmpdirname, "unet")) - assert (len(files) == 15 - ), f"We should only download 15 files, not {len(files)}" + assert len(files) == 15, f"We should only download 15 files, not {len(files)}" assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files - assert (len([ - f for f in files if f.endswith(f"{variant}{this_format}") - ]) == 1) - assert (sum( - f.endswith(this_format) and - not f.endswith(f"{variant}{this_format}") - for f in files) == 3) + assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1 + assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3 assert not any(f.endswith(other_format) for f in files) ppdiffusers.utils.import_utils._safetensors_available = True @@ -467,59 +438,52 @@ def test_local_save_load_index(self): @require_torch def test_text_inversion_download(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) import torch num_tokens = len(pipe.tokenizer) # single token load local with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<*>": torch.ones((32, ))} + ten = {"<*>": torch.ones((32,))} torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) pipe.load_textual_inversion(tmpdirname, from_diffusers=True) token = pipe.tokenizer.convert_tokens_to_ids("<*>") assert token == num_tokens, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 32) + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32 assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>" prompt = "hey <*>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) # single token load local with weight name ten = {"<**>": 2 * torch.ones((1, 32))} torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - pipe.load_textual_inversion( - tmpdirname, - weight_name="learned_embeds.bin", - from_diffusers=True) + pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin", from_diffusers=True) token = pipe.tokenizer.convert_tokens_to_ids("<**>") assert token == num_tokens + 1, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 64) + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64 assert 
pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>" prompt = "hey <**>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) # multi token load ten = { - "<***>": torch.cat([ - 3 * torch.ones((1, 32)), - 4 * torch.ones((1, 32)), - 5 * torch.ones((1, 32)), - ]) + "<***>": torch.cat( + [ + 3 * torch.ones((1, 32)), + 4 * torch.ones((1, 32)), + 5 * torch.ones((1, 32)), + ] + ) } torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) @@ -532,38 +496,31 @@ def test_text_inversion_download(self): assert token == num_tokens + 2, "Added token must be at spot `num_tokens`" assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`" assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() - == 96) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() - == 128) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 160) - assert (pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == - "<***> <***>_1 <***>_2") + assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 + assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 + assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***> <***>_1 <***>_2" prompt = "hey <***>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) # multi token load a1111 ten = { "string_to_param": { - "*": torch.cat([ - 3 * torch.ones((1, 32)), - 4 * torch.ones((1, 32)), - 5 * torch.ones((1, 32)), - ]) + "*": torch.cat( + [ + 3 * torch.ones((1, 32)), + 4 * torch.ones((1, 32)), + 5 * torch.ones((1, 32)), + ] + ) }, "name": "<****>", } torch.save(ten, os.path.join(tmpdirname, "a1111.bin")) - pipe.load_textual_inversion( - tmpdirname, weight_name="a1111.bin", from_diffusers=True) + pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin", from_diffusers=True) token = pipe.tokenizer.convert_tokens_to_ids("<****>") token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1") @@ -572,21 +529,13 @@ def test_text_inversion_download(self): assert token == num_tokens + 5, "Added token must be at spot `num_tokens`" assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`" assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() - == 96) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() - == 128) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 160) - assert (pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == - "<****> <****>_1 <****>_2") + assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 + assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 + assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****> <****>_1 <****>_2" prompt = "hey <****>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, 
output_type="numpy").images assert out.shape == (1, 128, 128, 3) def test_download_ignore_files(self): @@ -595,20 +544,16 @@ def test_download_ignore_files(self): # pipeline has Flax weights tmpdirname = DiffusionPipeline.download( "hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) files = [] for root, ds, fs in os.walk(tmpdirname): for f in fs: - str_path = str(os.path.join(root, f)).replace( - str(tmpdirname) + "/", "") + str_path = str(os.path.join(root, f)).replace(str(tmpdirname) + "/", "") files.append(str_path) # None of the downloaded files should be a pytorch file even if we have some here: # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f in files - for f in [ - "vae/diffusion_pytorch_model.bin", - "text_encoder/config.json" - ]) + assert not any(f in files for f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"]) assert len(files) == 13 @@ -616,7 +561,8 @@ class CustomPipelineTests(unittest.TestCase): def test_load_custom_pipeline(self): pipeline = DiffusionPipeline.from_pretrained( "google/ddpm-cifar10-32", - custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", ) + custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", + ) pipeline = pipeline assert pipeline.__class__.__name__ == "CustomPipeline" @@ -644,7 +590,8 @@ def test_load_custom_pipeline(self): def test_run_custom_pipeline(self): pipeline = DiffusionPipeline.from_pretrained( "google/ddpm-cifar10-32", - custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", ) + custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", + ) pipeline = pipeline images, output_str = pipeline(num_inference_steps=2, output_type="np") assert images[0].shape == (1, 32, 32, 3) @@ -653,8 +600,8 @@ def test_run_custom_pipeline(self): def test_local_custom_pipeline_repo(self): local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", - custom_pipeline=local_custom_pipeline_path) + "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path + ) pipeline = pipeline images, output_str = pipeline(num_inference_steps=2, output_type="np") assert pipeline.__class__.__name__ == "CustomLocalPipeline" @@ -663,11 +610,10 @@ def test_local_custom_pipeline_repo(self): def test_local_custom_pipeline_file(self): local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, - "what_ever.py") + local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py") pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", - custom_pipeline=local_custom_pipeline_path) + "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path + ) pipeline = pipeline images, output_str = pipeline(num_inference_steps=2, output_type="np") assert pipeline.__class__.__name__ == "CustomLocalPipeline" @@ -678,13 +624,13 @@ def test_local_custom_pipeline_file(self): @require_paddle_gpu def test_download_from_git(self): clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - feature_extractor = CLIPImageProcessor.from_pretrained( - clip_model_id, from_hf_hub=False) + feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id, from_hf_hub=False) clip_model = CLIPModel.from_pretrained( clip_model_id, paddle_dtype=paddle.float16, from_hf_hub=False, - from_diffusers=False, ) + from_diffusers=False, + ) pipeline = 
DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="clip_guided_stable_diffusion", @@ -692,17 +638,17 @@ def test_download_from_git(self): feature_extractor=feature_extractor, paddle_dtype=paddle.float16, from_hf_hub=False, - from_diffusers=False, ) + from_diffusers=False, + ) pipeline.enable_attention_slicing() assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion" - image = pipeline( - "a prompt", num_inference_steps=2, output_type="np").images[0] + image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0] assert image.shape == (512, 512, 3) def test_save_pipeline_change_config(self): pipe = DiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) @@ -710,8 +656,7 @@ def test_save_pipeline_change_config(self): assert pipe.scheduler.__class__.__name__ == "PNDMScheduler" - pipe.scheduler = DPMSolverMultistepScheduler.from_config( - pipe.scheduler.config) + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.save_pretrained(tmpdirname) pipe = DiffusionPipeline.from_pretrained(tmpdirname) @@ -732,8 +677,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image def dummy_uncond_unet(self, sample_size=32): @@ -745,7 +689,8 @@ def dummy_uncond_unet(self, sample_size=32): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def dummy_cond_unet(self, sample_size=32): @@ -758,7 +703,8 @@ def dummy_cond_unet(self, sample_size=32): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -770,7 +716,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -785,7 +732,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -803,24 +751,21 @@ def to(self, device): return extract - @parameterized.expand([ - [DDIMScheduler, DDIMPipeline, 32], - [DDPMScheduler, DDPMPipeline, 32], - [DDIMScheduler, DDIMPipeline, (32, 64)], - [DDPMScheduler, DDPMPipeline, (64, 32)], - ]) - def test_uncond_unet_components(self, - scheduler_fn=DDPMScheduler, - pipeline_fn=DDPMPipeline, - sample_size=32): + @parameterized.expand( + [ + [DDIMScheduler, DDIMPipeline, 32], + [DDPMScheduler, DDPMPipeline, 32], + [DDIMScheduler, DDIMPipeline, (32, 64)], + [DDPMScheduler, DDPMPipeline, (64, 32)], + ] + ) + def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32): unet = self.dummy_uncond_unet(sample_size) scheduler = scheduler_fn() pipeline = pipeline_fn(unet, scheduler) generator = paddle.Generator().manual_seed(0) - out_image = pipeline( - generator=generator, 
num_inference_steps=2, output_type="np").images - sample_size = ((sample_size, sample_size) - if isinstance(sample_size, int) else sample_size) + out_image = pipeline(generator=generator, num_inference_steps=2, output_type="np").images + sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size assert out_image.shape == (1, *sample_size, 3) def test_stable_diffusion_components(self): @@ -829,13 +774,10 @@ def test_stable_diffusion_components(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image().cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) inpaint = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -843,7 +785,8 @@ def test_stable_diffusion_components(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) img2img = StableDiffusionImg2ImgPipeline(**inpaint.components) text2img = StableDiffusionPipeline(**inpaint.components) prompt = "A painting of a squirrel eating a burger" @@ -854,18 +797,16 @@ def test_stable_diffusion_components(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ).images + mask_image=mask_image, + ).images image_img2img = img2img( [prompt], generator=generator, num_inference_steps=2, output_type="np", - image=init_image, ).images - image_text2img = text2img( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np").images + image=init_image, + ).images + image_text2img = text2img([prompt], generator=generator, num_inference_steps=2, output_type="np").images assert image_inpaint.shape == (1, 32, 32, 3) assert image_img2img.shape == (1, 32, 32, 3) assert image_text2img.shape == (1, 64, 64, 3) @@ -875,8 +816,7 @@ def test_set_scheduler(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -884,7 +824,8 @@ def test_set_scheduler(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, DDIMScheduler) sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config) @@ -895,11 +836,9 @@ def test_set_scheduler(self): assert isinstance(sd.scheduler, LMSDiscreteScheduler) sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, EulerDiscreteScheduler) - sd.scheduler = EulerAncestralDiscreteScheduler.from_config( - sd.scheduler.config) + sd.scheduler = EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler) - sd.scheduler = DPMSolverMultistepScheduler.from_config( - sd.scheduler.config) + sd.scheduler = 
DPMSolverMultistepScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, DPMSolverMultistepScheduler) def test_set_component_to_none(self): @@ -907,8 +846,7 @@ def test_set_component_to_none(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") pipeline = StableDiffusionPipeline( unet=unet, @@ -917,7 +855,8 @@ def test_set_component_to_none(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) generator = paddle.Generator().manual_seed(0) @@ -927,7 +866,8 @@ def test_set_component_to_none(self): prompt=prompt, generator=generator, num_inference_steps=1, - output_type="np", ).images + output_type="np", + ).images pipeline.feature_extractor = None generator = paddle.Generator().manual_seed(0) @@ -935,23 +875,19 @@ def test_set_component_to_none(self): prompt=prompt, generator=generator, num_inference_steps=1, - output_type="np", ).images + output_type="np", + ).images assert out_image.shape == (1, 64, 64, 3) assert np.abs(out_image - out_image_2).max() < 1e-3 def test_set_scheduler_consistency(self): unet = self.dummy_cond_unet() - pndm = PNDMScheduler.from_config( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") - ddim = DDIMScheduler.from_config( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") + ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd = StableDiffusionPipeline( unet=unet, scheduler=pndm, @@ -959,15 +895,13 @@ def test_set_scheduler_consistency(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) pndm_config = sd.scheduler.config sd.scheduler = DDPMScheduler.from_config(pndm_config) sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config) pndm_config_2 = sd.scheduler.config - pndm_config_2 = { - k: v - for k, v in pndm_config_2.items() if k in pndm_config - } + pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config} assert dict(pndm_config) == dict(pndm_config_2) sd = StableDiffusionPipeline( unet=unet, @@ -976,40 +910,33 @@ def test_set_scheduler_consistency(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) ddim_config = sd.scheduler.config sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config) sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) ddim_config_2 = sd.scheduler.config - ddim_config_2 = { - k: v - for k, v in ddim_config_2.items() if k in ddim_config - } + ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config} assert dict(ddim_config) == dict(ddim_config_2) def test_save_safe_serialization(self): pipeline = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", 
from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) with tempfile.TemporaryDirectory() as tmpdirname: - pipeline.save_pretrained( - tmpdirname, safe_serialization=True, to_diffusers=True) - vae_path = os.path.join(tmpdirname, "vae", - "diffusion_pytorch_model.safetensors") + pipeline.save_pretrained(tmpdirname, safe_serialization=True, to_diffusers=True) + vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors") assert os.path.exists(vae_path), f"Could not find {vae_path}" _ = safetensors.torch.load_file(vae_path) - unet_path = os.path.join(tmpdirname, "unet", - "diffusion_pytorch_model.safetensors") + unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors") assert os.path.exists(unet_path), f"Could not find {unet_path}" _ = safetensors.torch.load_file(unet_path) - text_encoder_path = os.path.join(tmpdirname, "text_encoder", - "model.safetensors") - assert os.path.exists( - text_encoder_path), f"Could not find {text_encoder_path}" + text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors") + assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}" _ = safetensors.torch.load_file(text_encoder_path) - pipeline = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=True) + pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=True) assert pipeline.unet is not None assert pipeline.vae is not None assert pipeline.text_encoder is not None @@ -1020,17 +947,17 @@ def test_no_pytorch_download_when_doing_safetensors(self): with tempfile.TemporaryDirectory() as tmpdirname: _ = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/diffusers-stable-diffusion-tiny-all", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) path = os.path.join( tmpdirname, "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all", "snapshots", "07838d72e12f9bcec1375b0482b80c1d399be843", - "unet", ) - assert os.path.exists( - os.path.join(path, "diffusion_pytorch_model.safetensors")) - assert not os.path.exists( - os.path.join(path, "diffusion_pytorch_model.bin")) + "unet", + ) + assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors")) + assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) def test_no_safetensors_download_when_doing_pytorch(self): import ppdiffusers @@ -1039,28 +966,25 @@ def test_no_safetensors_download_when_doing_pytorch(self): with tempfile.TemporaryDirectory() as tmpdirname: _ = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/diffusers-stable-diffusion-tiny-all", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) path = os.path.join( tmpdirname, "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all", "snapshots", "07838d72e12f9bcec1375b0482b80c1d399be843", - "unet", ) - assert not os.path.exists( - os.path.join(path, "diffusion_pytorch_model.safetensors")) - assert os.path.exists( - os.path.join(path, "diffusion_pytorch_model.bin")) + "unet", + ) + assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors")) + assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) ppdiffusers.utils.import_utils._safetensors_available = True def test_optional_components(self): unet = self.dummy_cond_unet() - pndm = PNDMScheduler.from_config( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") vae = 
self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd = StableDiffusionPipeline( unet=unet, scheduler=pndm, @@ -1068,7 +992,8 @@ def test_optional_components(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=unet, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) assert sd.config.requires_safety_checker is True with tempfile.TemporaryDirectory() as tmpdirname: sd.save_pretrained(tmpdirname) @@ -1076,7 +1001,8 @@ def test_optional_components(self): tmpdirname, feature_extractor=None, safety_checker=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) assert sd.config.requires_safety_checker is False assert sd.config.safety_checker == (None, None) assert sd.config.feature_extractor == (None, None) @@ -1092,8 +1018,7 @@ def test_optional_components(self): config["safety_checker"] = [None, None] with open(os.path.join(tmpdirname, sd.config_name), "w") as f: json.dump(config, f) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, requires_safety_checker=False) + sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False) sd.save_pretrained(tmpdirname) sd = StableDiffusionPipeline.from_pretrained(tmpdirname) assert sd.config.requires_safety_checker is False @@ -1110,8 +1035,7 @@ def test_optional_components(self): assert sd.config.safety_checker == (None, None) assert sd.config.feature_extractor == (None, None) sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, feature_extractor=self.dummy_extractor) + sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor) assert sd.config.requires_safety_checker is False assert sd.config.safety_checker == (None, None) assert sd.config.feature_extractor != (None, None) @@ -1119,13 +1043,13 @@ def test_optional_components(self): tmpdirname, feature_extractor=self.dummy_extractor, safety_checker=unet, - requires_safety_checker=[True, True], ) + requires_safety_checker=[True, True], + ) assert sd.config.requires_safety_checker == [True, True] assert sd.config.safety_checker != (None, None) assert sd.config.feature_extractor != (None, None) sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, feature_extractor=self.dummy_extractor) + sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor) assert sd.config.requires_safety_checker == [True, True] assert sd.config.safety_checker != (None, None) assert sd.config.feature_extractor != (None, None) @@ -1146,42 +1070,28 @@ def tearDown(self): def test_smart_download(self): model_id = "hf-internal-testing/unet-pipeline-dummy" with tempfile.TemporaryDirectory() as tmpdirname: - _ = DiffusionPipeline.from_pretrained( - model_id, cache_dir=tmpdirname, force_download=True) + _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True) local_repo_name = "--".join(["models"] + model_id.split("/")) - snapshot_dir = os.path.join(tmpdirname, local_repo_name, - "snapshots") - snapshot_dir = os.path.join(snapshot_dir, - os.listdir(snapshot_dir)[0]) - assert os.path.isfile( - os.path.join(snapshot_dir, DiffusionPipeline.config_name)) + snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots") + snapshot_dir = os.path.join(snapshot_dir, 
os.listdir(snapshot_dir)[0]) + assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name)) assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) - assert not os.path.isfile( - os.path.join(snapshot_dir, "big_array.npy")) + assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) + assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy")) def test_warning_unused_kwargs(self): model_id = "hf-internal-testing/unet-pipeline-dummy" logger = logging.get_logger("ppdiffusers.pipelines") with tempfile.TemporaryDirectory() as tmpdirname: with CaptureLogger(logger) as cap_logger: - DiffusionPipeline.from_pretrained( - model_id, - not_used=True, - cache_dir=tmpdirname, - force_download=True) + DiffusionPipeline.from_pretrained(model_id, not_used=True, cache_dir=tmpdirname, force_download=True) assert ( - cap_logger.out.strip().split("\n")[-1] == - "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored." + cap_logger.out.strip().split("\n")[-1] + == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored." 
) def test_from_save_pretrained(self): @@ -1192,7 +1102,8 @@ def test_from_save_pretrained(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) scheduler = DDPMScheduler(num_train_timesteps=10) ddpm = DDPMPipeline(model, scheduler) ddpm.set_progress_bar_config(disable=None) @@ -1202,59 +1113,41 @@ def test_from_save_pretrained(self): new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=5, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - new_image = new_ddpm( - generator=generator, num_inference_steps=5, - output_type="numpy").images + new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - assert (np.abs(image - new_image).sum() < 1e-5 - ), "Models don't give the same forward pass" + assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" def test_from_pretrained_hub(self): model_path = "google/ddpm-cifar10-32" scheduler = DDPMScheduler(num_train_timesteps=10) ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler) ddpm.set_progress_bar_config(disable=None) - ddpm_from_hub = DiffusionPipeline.from_pretrained( - model_path, scheduler=scheduler) + ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) ddpm_from_hub = ddpm_from_hub ddpm_from_hub.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=5, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - new_image = ddpm_from_hub( - generator=generator, num_inference_steps=5, - output_type="numpy").images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't give the same forward pass" + new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass" def test_from_pretrained_hub_pass_model(self): model_path = "google/ddpm-cifar10-32" scheduler = DDPMScheduler(num_train_timesteps=10) unet = UNet2DModel.from_pretrained(model_path) - ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained( - model_path, unet=unet, scheduler=scheduler) + ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler) ddpm_from_hub_custom_model = ddpm_from_hub_custom_model ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) - ddpm_from_hub = DiffusionPipeline.from_pretrained( - model_path, scheduler=scheduler) + ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm_from_hub_custom_model( - generator=generator, num_inference_steps=5, - output_type="numpy").images + image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - new_image = ddpm_from_hub( - generator=generator, num_inference_steps=5, - output_type="numpy").images - assert (np.abs(image - 
new_image).sum() < 1e-05 - ), "Models don't give the same forward pass" + new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass" def test_output_format(self): model_path = "google/ddpm-cifar10-32" @@ -1292,8 +1185,7 @@ def test_ddpm_ddim_equality_batched(self): ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler) ddim.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(seed) - ddpm_images = ddpm( - batch_size=2, generator=generator, output_type="numpy").images + ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images generator = paddle.Generator().manual_seed(seed) ddim_images = ddim( batch_size=2, @@ -1301,5 +1193,6 @@ def test_ddpm_ddim_equality_batched(self): num_inference_steps=1000, eta=1.0, output_type="numpy", - use_clipped_model_output=True, ).images + use_clipped_model_output=True, + ).images assert np.abs(ddpm_images - ddim_images).max() < 0.1 diff --git a/ppdiffusers/tests/pipelines/test_pipelines_common.py b/ppdiffusers/tests/pipelines/test_pipelines_common.py index 5b09ecc71d187..c92b77174f7dc 100644 --- a/ppdiffusers/tests/pipelines/test_pipelines_common.py +++ b/ppdiffusers/tests/pipelines/test_pipelines_common.py @@ -48,16 +48,18 @@ class PipelineTesterMixin: # Canonical parameters that are passed to `__call__` regardless # of the type of pipeline. They are always optional and have common # sense default values. - required_optional_params = frozenset([ - "num_inference_steps", - "num_images_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "num_images_per_prompt", + "generator", + "latents", + "output_type", + "return_dict", + "callback", + "callback_steps", + ] + ) num_inference_steps_args = ["num_inference_steps"] test_attention_slicing = True test_cpu_offload = False @@ -95,7 +97,8 @@ def params(self) -> frozenset: "do not make modifications to the existing common sets of arguments. I.e. a text to image pipeline " "with non-configurable height and width arguments should set the attribute as " "`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. " - "See existing pipeline tests for reference.") + "See existing pipeline tests for reference." + ) @property def batch_params(self) -> frozenset: @@ -108,7 +111,8 @@ def batch_params(self) -> frozenset: "do not make modifications to the existing common sets of batch arguments. I.e. a text to " "image pipeline `negative_prompt` is not batched should set the attribute as " "`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. " - "See existing pipeline tests for reference.") + "See existing pipeline tests for reference." 
+ ) def tearDown(self): super().tearDown() @@ -123,8 +127,7 @@ def test_save_load_local(self): output = pipe(**inputs)[0] with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir, to_diffusers=False) - pipe_loaded = self.pipeline_class.from_pretrained( - tmpdir, from_diffusers=False) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) pipe_loaded.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output_loaded = pipe_loaded(**inputs)[0] @@ -134,7 +137,8 @@ def test_save_load_local(self): def test_pipeline_call_signature(self): self.assertTrue( hasattr(self.pipeline_class, "__call__"), - f"{self.pipeline_class} should have a `__call__` method", ) + f"{self.pipeline_class} should have a `__call__` method", + ) parameters = inspect.signature(self.pipeline_class.__call__).parameters @@ -146,9 +150,7 @@ def test_pipeline_call_signature(self): parameters = set(parameters.keys()) parameters.remove("self") - parameters.discard( - "kwargs" - ) # kwargs can be added if arguments of pipeline call function are deprecated + parameters.discard("kwargs") # kwargs can be added if arguments of pipeline call function are deprecated remaining_required_parameters = set() @@ -176,9 +178,10 @@ def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]): self._test_inference_batch_consistent(batch_sizes=batch_sizes) def _test_inference_batch_consistent( - self, - batch_sizes=[2, 4, 13], - additional_params_copy_to_batched_inputs=["num_inference_steps"], ): + self, + batch_sizes=[2, 4, 13], + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) @@ -191,10 +194,7 @@ def _test_inference_batch_consistent( if name in self.batch_params: if name == "prompt": len_prompt = len(value) - batched_inputs[name] = [ - value[:len_prompt // i] - for i in range(1, batch_size + 1) - ] + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] batched_inputs[name][-1] = 2000 * "very long" else: batched_inputs[name] = batch_size * [value] @@ -220,13 +220,14 @@ def test_inference_batch_single_identical(self, batch_size=3): self._test_inference_batch_single_identical(batch_size=batch_size) def _test_inference_batch_single_identical( - self, - batch_size=3, - test_max_difference=None, - test_mean_pixel_difference=None, - relax_max_difference=False, - expected_max_diff=1e-4, - additional_params_copy_to_batched_inputs=["num_inference_steps"], ): + self, + batch_size=3, + test_max_difference=None, + test_mean_pixel_difference=None, + relax_max_difference=False, + expected_max_diff=1e-4, + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): components = self.get_dummy_components() pipe = self.pipeline_class(**components) @@ -240,19 +241,14 @@ def _test_inference_batch_single_identical( if name in self.batch_params: if name == "prompt": len_prompt = len(value) - batched_inputs[name] = [ - value[:len_prompt // i] - for i in range(1, batch_size + 1) - ] + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] batched_inputs[name][-1] = 2000 * "very long" else: batched_inputs[name] = batch_size * [value] elif name == "batch_size": batched_inputs[name] = batch_size elif name == "generator": - batched_inputs[name] = [ - self.get_generator(i) for i in range(batch_size) - ] + batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)] else: 
batched_inputs[name] = value @@ -293,8 +289,7 @@ def test_components_function(self): init_components = self.get_dummy_components() pipe = self.pipeline_class(**init_components) self.assertTrue(hasattr(pipe, "components")) - self.assertTrue( - set(pipe.components.keys()) == set(init_components.keys())) + self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) def test_float16_inference(self, expected_max_diff=1e-2): self._test_float16_inference(expected_max_diff) @@ -312,7 +307,8 @@ def _test_float16_inference(self, expected_max_diff=1e-2): self.assertLess( max_diff, expected_max_diff, - "The outputs of the fp16 and fp32 pipelines are too different.", ) + "The outputs of the fp16 and fp32 pipelines are too different.", + ) def test_save_load_float16(self, expected_max_diff=1e-2): self._test_save_load_float16(expected_max_diff) @@ -360,8 +356,7 @@ def test_save_load_optional_components(self): with tempfile.TemporaryDirectory() as tmpdir: # TODO check this pipe.save_pretrained(tmpdir, to_diffusers=False) - pipe_loaded = self.pipeline_class.from_pretrained( - tmpdir, from_diffusers=False) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) pipe_loaded.set_progress_bar_config(disable=None) for optional_component in pipe._optional_components: self.assertTrue( @@ -394,27 +389,22 @@ def test_to_dtype(self): pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) - model_dtypes = [ - component.dtype for component in components.values() - if hasattr(component, "dtype") - ] + model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == paddle.float32 for dtype in model_dtypes)) pipe.to(paddle_dtype=paddle.float16) - model_dtypes = [ - component.dtype for component in components.values() - if hasattr(component, "dtype") - ] + model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == paddle.float16 for dtype in model_dtypes)) def test_attention_slicing_forward_pass(self): self._test_attention_slicing_forward_pass() def _test_attention_slicing_forward_pass( - self, - test_max_difference=True, - test_mean_pixel_difference=True, - expected_max_diff=5e-3, ): + self, + test_max_difference=True, + test_mean_pixel_difference=True, + expected_max_diff=5e-3, + ): if not self.test_attention_slicing: return @@ -427,25 +417,24 @@ def _test_attention_slicing_forward_pass( inputs = self.get_dummy_inputs() output_with_slicing = pipe(**inputs)[0] if test_max_difference: - max_diff = np.abs( - to_np(output_with_slicing) - to_np(output_without_slicing)).max( - ) + max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max() self.assertLess( max_diff, expected_max_diff, - "Attention slicing should not affect the inference results", ) + "Attention slicing should not affect the inference results", + ) if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_slicing[0], - output_without_slicing[0]) + assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0]) def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass() def _test_xformers_attention_forwardGenerator_pass( - self, - test_max_difference=True, - test_mean_pixel_difference=True, - expected_max_diff=1e-2, ): + self, + test_max_difference=True, + test_mean_pixel_difference=True, + expected_max_diff=1e-2, + ): if not 
self.test_xformers_attention: return components = self.get_dummy_components() @@ -461,15 +450,14 @@ def _test_xformers_attention_forwardGenerator_pass( output_with_xformers = output_with_xformers.numpy() if hasattr(output_without_xformers, "numpy"): output_without_xformers = output_without_xformers.numpy() - max_diff = np.abs(output_with_xformers - - output_without_xformers).max() + max_diff = np.abs(output_with_xformers - output_without_xformers).max() self.assertLess( max_diff, expected_max_diff, - "XFormers attention should not affect the inference results", ) + "XFormers attention should not affect the inference results", + ) if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_xformers[0], - output_without_xformers[0]) + assert_mean_pixel_difference(output_with_xformers[0], output_without_xformers[0]) def test_progress_bar(self): components = self.get_dummy_components() @@ -482,12 +470,12 @@ def test_progress_bar(self): self.assertTrue(max_steps is not None and len(max_steps) > 0) self.assertTrue( f"{max_steps}/{max_steps}" in stderr, - "Progress bar should be enabled and stopped at the max step", ) + "Progress bar should be enabled and stopped at the max step", + ) pipe.set_progress_bar_config(disable=True) with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): _ = pipe(**inputs) - self.assertTrue(stderr.getvalue() == "", - "Progress bar should be disabled") + self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled") def test_num_images_per_prompt(self): sig = inspect.signature(self.pipeline_class.__call__) @@ -510,17 +498,13 @@ def test_num_images_per_prompt(self): if key in self.batch_params: inputs[key] = batch_size * [inputs[key]] - images = pipe( - **inputs, - num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape[0] == batch_size * num_images_per_prompt def assert_mean_pixel_difference(image, expected_image): - image = np.asarray( - DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) - expected_image = np.asarray( - DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) + image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) + expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) avg_diff = np.abs(image - expected_image).mean() assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average" diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py index 23825d0855c71..b6cb10d5a3545 100644 --- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py +++ b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py @@ -19,9 +19,13 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, TextToVideoSDPipeline, - UNet3DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + TextToVideoSDPipeline, + UNet3DConditionModel, +) from ppdiffusers.utils import load_numpy, slow from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS @@ -32,14 +36,16 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = TextToVideoSDPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - 
required_optional_params = frozenset([ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback", - "callback_steps", - ]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback", + "callback_steps", + ] + ) def get_dummy_components(self): paddle.seed(0) @@ -53,20 +59,24 @@ def get_dummy_components(self): "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", - "DownBlock3D", ), + "DownBlock3D", + ), up_block_types=( "UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", - "CrossAttnUpBlock3D", ), + "CrossAttnUpBlock3D", + ), cross_attention_dim=32, - attention_head_dim=4, ) + attention_head_dim=4, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -75,7 +85,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -88,10 +99,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -128,28 +139,20 @@ def test_text_to_video_default_case(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - test_mean_pixel_difference=False) + self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False) def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass( - test_mean_pixel_difference=False) + self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - @unittest.skip( - reason="Batching needs to be properly figured out first for this pipeline." - ) + @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") def test_inference_batch_consistent(self): pass - @unittest.skip( - reason="Batching needs to be properly figured out first for this pipeline." - ) + @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") def test_inference_batch_single_identical(self): pass - @unittest.skip( - reason="`num_images_per_prompt` argument is not supported for this pipeline." 
- ) + @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") def test_num_images_per_prompt(self): pass @@ -161,19 +164,13 @@ def test_full_model(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy" ) pipe = TextToVideoSDPipeline.from_pretrained( - "damo-vilab/text-to-video-ms-1.7b", - from_hf_hub=True, - from_diffusers=True) - pipe.scheduler = DPMSolverMultistepScheduler.from_config( - pipe.scheduler.config) + "damo-vilab/text-to-video-ms-1.7b", from_hf_hub=True, from_diffusers=True + ) + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe = pipe prompt = "Spiderman is surfing" generator = paddle.Generator().manual_seed(0) - video_frames = pipe( - prompt, - generator=generator, - num_inference_steps=25, - output_type="pd").frames + video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pd").frames video = video_frames.cpu().numpy() assert np.abs(expected_video - video).mean() < 0.8 @@ -181,15 +178,10 @@ def test_two_step_model(self): expected_video = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy" ) - pipe = TextToVideoSDPipeline.from_pretrained( - "damo-vilab/text-to-video-ms-1.7b") + pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") pipe = pipe prompt = "Spiderman is surfing" generator = paddle.Generator().manual_seed(0) - video_frames = pipe( - prompt, - generator=generator, - num_inference_steps=2, - output_type="pd").frames + video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pd").frames video = video_frames.cpu().numpy() assert np.abs(expected_video - video).mean() < 0.8 diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py index 8387b54267696..121798ea45e07 100644 --- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py +++ b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py @@ -27,8 +27,7 @@ class TextToVideoZeroPipelineSlowTests(unittest.TestCase): def test_full_model(self): model_id = "runwayml/stable-diffusion-v1-5" - pipe = TextToVideoZeroPipeline.from_pretrained( - model_id, torch_dtype="float16") + pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype="float16") pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) generator = paddle.Generator().manual_seed(0) prompt = "A bear is playing a guitar on Times Square" diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip.py b/ppdiffusers/tests/pipelines/unclip/test_unclip.py index 3f0b1a190c645..3e8d64094abd3 100644 --- a/ppdiffusers/tests/pipelines/unclip/test_unclip.py +++ b/ppdiffusers/tests/pipelines/unclip/test_unclip.py @@ -18,18 +18,25 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer) - -from ppdiffusers import (PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, - UNet2DConditionModel, UNet2DModel) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, +) + +from ppdiffusers import ( + PriorTransformer, + UnCLIPPipeline, + UnCLIPScheduler, + UNet2DConditionModel, + UNet2DModel, +) from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils 
import require_paddle_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import (PipelineTesterMixin, - assert_mean_pixel_difference) +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): @@ -44,13 +51,15 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "cross_attention_kwargs", } batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = frozenset([ - "generator", - "return_dict", - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ]) + required_optional_params = frozenset( + [ + "generator", + "return_dict", + "prior_num_inference_steps", + "decoder_num_inference_steps", + "super_res_num_inference_steps", + ] + ) test_xformers_attention = False @property @@ -75,8 +84,7 @@ def cross_attention_dim(self): @property def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") return tokenizer @property @@ -92,7 +100,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModelWithProjection(config) @property @@ -127,13 +136,14 @@ def dummy_decoder(self): "out_channels": 6, "down_block_types": ( "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", ), - "up_block_types": - ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "SimpleCrossAttnDownBlock2D", + ), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "layers_per_block": 1, "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, @@ -148,13 +158,12 @@ def dummy_super_res_kwargs(self): return { "sample_size": 64, "layers_per_block": 1, - "down_block_types": - ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": - ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), + "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), + "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "in_channels": 6, "out_channels": 3, } @@ -183,15 +192,18 @@ def get_dummy_components(self): variance_type="fixed_small_log", prediction_type="sample", num_train_timesteps=1000, - clip_sample_range=5.0, ) + clip_sample_range=5.0, + ) decoder_scheduler = UnCLIPScheduler( variance_type="learned_range", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) super_res_scheduler = UnCLIPScheduler( variance_type="fixed_small_log", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) components = { "prior": prior, "decoder": decoder, @@ -229,20 +241,21 @@ def test_unclip(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 2.6383996e-04, - 9.9658674e-01, - 1.1275411e-03, - 2.6383996e-04, - 2.6383996e-04, - 9.9702907e-01, - 9.9973619e-01, - 9.9545717e-01, - 2.6383996e-04, - ]) + expected_slice = np.array( + 
[ + 2.6383996e-04, + 9.9658674e-01, + 1.1275411e-03, + 2.6383996e-04, + 2.6383996e-04, + 9.9702907e-01, + 9.9973619e-01, + 9.9545717e-01, + 2.6383996e-04, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_passed_text_embed(self): class DummyScheduler: @@ -264,29 +277,34 @@ class DummyScheduler: dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) shape = ( batch_size, decoder.config.in_channels, decoder.config.sample_size, - decoder.config.sample_size, ) + decoder.config.sample_size, + ) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) shape = ( batch_size, super_res_first.config.in_channels // 2, super_res_first.config.sample_size, - super_res_first.config.sample_size, ) + super_res_first.config.sample_size, + ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) pipe.set_progress_bar_config(disable=None) prompt = "this is a prompt example" generator = paddle.Generator().manual_seed(0) @@ -299,14 +317,16 @@ class DummyScheduler: prior_latents=prior_latents, decoder_latents=decoder_latents, super_res_latents=super_res_latents, - output_type="np", ) + output_type="np", + ) image = output.images text_inputs = tokenizer( prompt, padding="max_length", max_length=tokenizer.model_max_length, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_model_output = text_encoder(text_inputs.input_ids) text_attention_mask = text_inputs.attention_mask generator = paddle.Generator().manual_seed(0) @@ -320,13 +340,13 @@ class DummyScheduler: super_res_latents=super_res_latents, text_model_output=text_model_output, text_attention_mask=text_attention_mask, - output_type="np", )[0] + output_type="np", + )[0] assert np.abs(image - image_from_text).max() < 0.0001 def test_attention_slicing_forward_pass(self): test_max_difference = False - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, expected_max_diff=0.01) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01) def test_inference_batch_single_identical(self): test_max_difference = False @@ -365,8 +385,7 @@ def tearDown(self): def test_unclip_karlo(self): # Hard code image - expected_image = np.array([[0.73281264, 0.69175875, 0.64672112], - [0.71919304, 0.65395129, 0.60436499]]) + expected_image = np.array([[0.73281264, 0.69175875, 0.64672112], [0.71919304, 0.65395129, 0.60436499]]) pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") pipeline.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py index 2bbb56cfad604..e09f906a7f87d 100644 --- a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py @@ -20,32 +20,41 @@ import numpy as np import paddle from paddlenlp.transformers import ( - CLIPImageProcessor, CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer, CLIPVisionConfig, 
CLIPVisionModelWithProjection) - -from ppdiffusers import (DiffusionPipeline, UnCLIPImageVariationPipeline, - UnCLIPScheduler, UNet2DConditionModel, UNet2DModel) + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ppdiffusers import ( + DiffusionPipeline, + UnCLIPImageVariationPipeline, + UnCLIPScheduler, + UNet2DConditionModel, + UNet2DModel, +) from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel from ppdiffusers.utils import floats_tensor, slow from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu -from ..pipeline_params import (IMAGE_VARIATION_BATCH_PARAMS, - IMAGE_VARIATION_PARAMS) -from ..test_pipelines_common import (PipelineTesterMixin, - assert_mean_pixel_difference) +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = UnCLIPImageVariationPipeline params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"} batch_params = IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = frozenset([ - "generator", - "return_dict", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ]) + required_optional_params = frozenset( + [ + "generator", + "return_dict", + "decoder_num_inference_steps", + "super_res_num_inference_steps", + ] + ) test_xformers_attention = False @property @@ -70,8 +79,7 @@ def cross_attention_dim(self): @property def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") return tokenizer @property @@ -87,7 +95,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModelWithProjection(config) @property @@ -100,7 +109,8 @@ def dummy_image_encoder(self): num_attention_heads=4, image_size=32, intermediate_size=37, - patch_size=1, ) + patch_size=1, + ) return CLIPVisionModelWithProjection(config) @property @@ -123,13 +133,14 @@ def dummy_decoder(self): "out_channels": 6, "down_block_types": ( "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", ), - "up_block_types": - ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "SimpleCrossAttnDownBlock2D", + ), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "layers_per_block": 1, "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, @@ -144,13 +155,12 @@ def dummy_super_res_kwargs(self): return { "sample_size": 64, "layers_per_block": 1, - "down_block_types": - ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": - ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), + "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), + "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "in_channels": 6, "out_channels": 3, } @@ 
-177,11 +187,13 @@ def get_dummy_components(self): decoder_scheduler = UnCLIPScheduler( variance_type="learned_range", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) super_res_scheduler = UnCLIPScheduler( variance_type="fixed_small_log", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) feature_extractor = CLIPImageProcessor(crop_size=32, size=32) image_encoder = self.dummy_image_encoder return { @@ -207,8 +219,7 @@ def get_dummy_inputs(self, seed=0, pil_image=True): if pil_image: input_image = input_image * 0.5 + 0.5 input_image = input_image.clip(min=0, max=1) - input_image = (input_image.cpu().transpose( - perm=[0, 2, 3, 1]).cast("float32").numpy()) + input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy() input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] return { "image": input_image, @@ -230,20 +241,21 @@ def test_unclip_image_variation_input_tensor(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 2.7585030e-03, - 2.6383996e-04, - 9.9801058e-01, - 2.6383996e-04, - 9.9531418e-01, - 9.9220645e-01, - 3.6702752e-03, - 9.9970925e-01, - 9.9973619e-01, - ]) + expected_slice = np.array( + [ + 2.7585030e-03, + 2.6383996e-04, + 9.9801058e-01, + 2.6383996e-04, + 9.9531418e-01, + 9.9220645e-01, + 3.6702752e-03, + 9.9970925e-01, + 9.9973619e-01, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_image_variation_input_image(self): components = self.get_dummy_components() @@ -257,28 +269,28 @@ def test_unclip_image_variation_input_image(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 5.2168965e-04, - 9.9861604e-01, - 9.9755847e-01, - 9.9804187e-01, - 9.9411416e-01, - 9.9248302e-01, - 9.9973619e-01, - 9.9777901e-01, - 9.9973619e-01, - ]) + expected_slice = np.array( + [ + 5.2168965e-04, + 9.9861604e-01, + 9.9755847e-01, + 9.9804187e-01, + 9.9411416e-01, + 9.9248302e-01, + 9.9973619e-01, + 9.9777901e-01, + 9.9973619e-01, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_image_variation_input_list_images(self): components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) pipeline_inputs = self.get_dummy_inputs(pil_image=True) - pipeline_inputs[ - "image"] = [pipeline_inputs["image"], pipeline_inputs["image"]] + pipeline_inputs["image"] = [pipeline_inputs["image"], pipeline_inputs["image"]] output = pipe(**pipeline_inputs) image = output.images tuple_pipeline_inputs = self.get_dummy_inputs(pil_image=True) @@ -290,20 +302,21 @@ def test_unclip_image_variation_input_list_images(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([ - 5.2201748e-04, - 9.9861759e-01, - 9.9755961e-01, - 9.9804127e-01, - 9.9411547e-01, - 9.9248385e-01, - 9.9973619e-01, - 9.9777836e-01, - 
9.9973619e-01, - ]) + expected_slice = np.array( + [ + 5.2201748e-04, + 9.9861759e-01, + 9.9755961e-01, + 9.9804127e-01, + 9.9411547e-01, + 9.9248385e-01, + 9.9973619e-01, + 9.9777836e-01, + 9.9973619e-01, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_passed_image_embed(self): class DummyScheduler: @@ -319,29 +332,34 @@ class DummyScheduler: batch_size, pipe.decoder.config.in_channels, pipe.decoder.config.sample_size, - pipe.decoder.config.sample_size, ) + pipe.decoder.config.sample_size, + ) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) shape = ( batch_size, pipe.super_res_first.config.in_channels // 2, pipe.super_res_first.config.sample_size, - pipe.super_res_first.config.sample_size, ) + pipe.super_res_first.config.sample_size, + ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) pipeline_inputs = self.get_dummy_inputs(pil_image=False) img_out_1 = pipe( **pipeline_inputs, decoder_latents=decoder_latents, - super_res_latents=super_res_latents, ).images + super_res_latents=super_res_latents, + ).images pipeline_inputs = self.get_dummy_inputs(pil_image=False) image = pipeline_inputs.pop("image") image_embeddings = pipe.image_encoder(image).image_embeds @@ -349,7 +367,8 @@ class DummyScheduler: **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents, - image_embeddings=image_embeddings, ).images + image_embeddings=image_embeddings, + ).images assert np.abs(img_out_1 - img_out_2).max() < 0.0001 def test_attention_slicing_forward_pass(self): @@ -358,8 +377,8 @@ def test_attention_slicing_forward_pass(self): expected_max_diff = 1e-2 self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, - expected_max_diff=expected_max_diff) + test_max_difference=test_max_difference, expected_max_diff=expected_max_diff + ) def test_inference_batch_single_identical(self): test_max_difference = False @@ -398,11 +417,9 @@ def test_unclip_image_variation_karlo(self): input_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png" ) - expected_image = np.array([[0.09096909, 0.13343304, 0.26244187], - [0.15095001, 0.19459972, 0.3182609]]) + expected_image = np.array([[0.09096909, 0.13343304, 0.26244187], [0.15095001, 0.19459972, 0.3182609]]) # TODO(wugaosheng): test this function - pipeline = UnCLIPImageVariationPipeline.from_pretrained( - "kakaobrain/karlo-v1-alpha-image-variations") + pipeline = UnCLIPImageVariationPipeline.from_pretrained("kakaobrain/karlo-v1-alpha-image-variations") pipeline.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) output = pipeline(input_image, generator=generator, output_type="np") diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py index 35b1372d082b8..c3906861b23a7 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py @@ 
-21,8 +21,7 @@ import paddle from ppdiffusers import VersatileDiffusionDualGuidedPipeline -from ppdiffusers.utils.testing_utils import (load_image, nightly, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import load_image, nightly, require_paddle_gpu @nightly @@ -34,8 +33,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() pipe.set_progress_bar_config(disable=None) second_prompt = load_image( @@ -49,11 +47,11 @@ def test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname, from_diffusers=False) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) new_image = pipe( @@ -63,13 +61,12 @@ def test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't have the same forward pass" + output_type="numpy", + ).images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" def test_inference_dual_guided(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() pipe.set_progress_bar_config(disable=None) first_prompt = "cyberpunk 2077" @@ -84,18 +81,21 @@ def test_inference_dual_guided(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.01500076, - 0.01142624, - 0.01418972, - 0.01518875, - 0.01114869, - 0.01190853, - 0.02978998, - 0.02376354, - 0.02396089, - ]) + expected_slice = np.array( + [ + 0.01500076, + 0.01142624, + 0.01418972, + 0.01518875, + 0.01114869, + 0.01190853, + 0.02978998, + 0.02376354, + 0.02396089, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py index fbc38ee9f49a1..8335bdf260d7a 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py @@ -19,8 +19,7 @@ import paddle from ppdiffusers import VersatileDiffusionImageVariationPipeline -from ppdiffusers.utils.testing_utils import (load_image, require_paddle_gpu, - slow) +from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu, slow class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): @@ -29,11 +28,9 @@ class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): @slow @require_paddle_gpu 
-class VersatileDiffusionImageVariationPipelineIntegrationTests( - unittest.TestCase): +class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase): def test_inference_image_variations(self): - pipe = VersatileDiffusionImageVariationPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.set_progress_bar_config(disable=None) image_prompt = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" @@ -44,18 +41,21 @@ def test_inference_image_variations(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.12047189, - 0.19138041, - 0.22884357, - 0.08833978, - 0.1594424, - 0.16826832, - 0.07032129, - 0.14926612, - 0.12981007, - ]) + expected_slice = np.array( + [ + 0.12047189, + 0.19138041, + 0.22884357, + 0.08833978, + 0.1594424, + 0.16826832, + 0.07032129, + 0.14926612, + 0.12981007, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py index ed49997b5a89b..aab7e81ba0c40 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py @@ -21,8 +21,7 @@ import paddle from ppdiffusers import VersatileDiffusionPipeline -from ppdiffusers.utils.testing_utils import (load_image, nightly, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import load_image, nightly, require_paddle_gpu class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase): @@ -38,8 +37,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_from_save_pretrained(self): - pipe = VersatileDiffusionPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.set_progress_bar_config(disable=None) prompt_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" @@ -52,11 +50,11 @@ def test_from_save_pretrained(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) new_image = pipe.dual_guided( @@ -66,13 +64,12 @@ def test_from_save_pretrained(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't have the same forward pass" + output_type="numpy", + ).images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" def test_inference_dual_guided_then_text_to_image(self): - pipe = VersatileDiffusionPipeline.from_pretrained( - "shi-labs/versatile-diffusion", 
paddle_dtype=paddle.float16) + pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) prompt = "cyberpunk 2077" init_image = load_image( @@ -86,21 +83,24 @@ def test_inference_dual_guided_then_text_to_image(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001]) - expected_slice = np.array([ - 0.03100586, - 0.02929688, - 0.03271484, - 0.02807617, - 0.02905273, - 0.03173828, - 0.02685547, - 0.02807617, - 0.03271484, - ]) + expected_slice = np.array( + [ + 0.03100586, + 0.02929688, + 0.03271484, + 0.02807617, + 0.02905273, + 0.03173828, + 0.02685547, + 0.02807617, + 0.03271484, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 prompt = "A painting of a squirrel eating a burger " generator = paddle.Generator().manual_seed(0) @@ -109,36 +109,40 @@ def test_inference_dual_guided_then_text_to_image(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.387, 0.479, 0.3796, 0.4009, 0.4878, 0.4778]) - expected_slice = np.array([ - 0.0390625, - 0.00854492, - 0.0, - 0.03930664, - 0.00878906, - 0.04711914, - 0.03686523, - 0.0, - 0.0246582, - ]) + expected_slice = np.array( + [ + 0.0390625, + 0.00854492, + 0.0, + 0.03930664, + 0.00878906, + 0.04711914, + 0.03686523, + 0.0, + 0.0246582, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - image = pipe.image_variation( - init_image, generator=generator, output_type="numpy").images + image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.377, 0.3894, 0.4297, 0.4331, 0.4456]) - expected_slice = np.array([ - 0.34472656, - 0.1940918, - 0.10546875, - 0.38134766, - 0.24560547, - 0.13208008, - 0.38867188, - 0.30566406, - 0.18188477, - ]) + expected_slice = np.array( + [ + 0.34472656, + 0.1940918, + 0.10546875, + 0.38134766, + 0.24560547, + 0.13208008, + 0.38867188, + 0.30566406, + 0.18188477, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py index fbe47142eafcb..c95b30030f3d5 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py @@ -37,8 +37,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger " @@ -48,11 +47,11 @@ def 
test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname, from_diffusers=False) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) new_image = pipe( @@ -60,13 +59,12 @@ def test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't have the same forward pass" + output_type="numpy", + ).images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" def test_inference_text2img(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger " generator = paddle.Generator().manual_seed(0) @@ -75,19 +73,22 @@ def test_inference_text2img(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.3493, 0.3757, 0.4093, 0.4495, 0.4233, 0.4102, 0.4507, 0.4756, 0.4787]) - expected_slice = np.array([ - 0.0390625, - 0.00854492, - 0.0, - 0.03930664, - 0.00878906, - 0.04711914, - 0.03686523, - 0.0, - 0.0246582, - ]) + expected_slice = np.array( + [ + 0.0390625, + 0.00854492, + 0.0, + 0.03930664, + 0.00878906, + 0.04711914, + 0.03686523, + 0.0, + 0.0246582, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py index 5c65fd95fc95f..c17b7fd1d7257 100644 --- a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py @@ -20,10 +20,15 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (Transformer2DModel, VQDiffusionPipeline, - VQDiffusionScheduler, VQModel) -from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import \ - LearnedClassifierFreeSamplingEmbeddings +from ppdiffusers import ( + Transformer2DModel, + VQDiffusionPipeline, + VQDiffusionScheduler, + VQModel, +) +from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import ( + LearnedClassifierFreeSamplingEmbeddings, +) from ppdiffusers.utils import load_numpy, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -57,13 +62,13 @@ def dummy_vqvae(self): up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=3, num_vq_embeddings=self.num_embed, - vq_embed_dim=3, ) + vq_embed_dim=3, + ) return model @property def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") return tokenizer @property @@ -78,7 +83,8 @@ def dummy_text_encoder(self): num_attention_heads=4, 
num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -106,8 +112,7 @@ def test_vq_diffusion(self): tokenizer = self.dummy_tokenizer transformer = self.dummy_transformer scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = ( - LearnedClassifierFreeSamplingEmbeddings(learnable=False)) + learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=False) pipe = VQDiffusionPipeline( vqvae=vqvae, text_encoder=text_encoder, @@ -119,11 +124,7 @@ def test_vq_diffusion(self): pipe.set_progress_bar_config(disable=None) prompt = "teddy bear playing in the pool" generator = paddle.Generator().manual_seed(0) - output = pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np") + output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = pipe( @@ -131,24 +132,26 @@ def test_vq_diffusion(self): generator=generator, output_type="np", return_dict=False, - num_inference_steps=2, )[0] + num_inference_steps=2, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 24, 24, 3) - expected_slice = np.array([ - 0.5900591, - 0.83443725, - 0.4418438, - 0.604656, - 0.89781034, - 0.40088692, - 0.6107253, - 0.87849474, - 0.64088374, - ]) + expected_slice = np.array( + [ + 0.5900591, + 0.83443725, + 0.4418438, + 0.604656, + 0.89781034, + 0.40088692, + 0.6107253, + 0.87849474, + 0.64088374, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_vq_diffusion_classifier_free_sampling(self): vqvae = self.dummy_vqvae @@ -156,11 +159,11 @@ def test_vq_diffusion_classifier_free_sampling(self): tokenizer = self.dummy_tokenizer transformer = self.dummy_transformer scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = ( - LearnedClassifierFreeSamplingEmbeddings( - learnable=True, - hidden_size=self.text_embedder_hidden_size, - length=tokenizer.model_max_length, )) + learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings( + learnable=True, + hidden_size=self.text_embedder_hidden_size, + length=tokenizer.model_max_length, + ) pipe = VQDiffusionPipeline( vqvae=vqvae, text_encoder=text_encoder, @@ -172,11 +175,7 @@ def test_vq_diffusion_classifier_free_sampling(self): pipe.set_progress_bar_config(disable=None) prompt = "teddy bear playing in the pool" generator = paddle.Generator().manual_seed(0) - output = pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np") + output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = pipe( @@ -184,24 +183,26 @@ def test_vq_diffusion_classifier_free_sampling(self): generator=generator, output_type="np", return_dict=False, - num_inference_steps=2, )[0] + num_inference_steps=2, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 24, 24, 3) - expected_slice = np.array([ - 0.61711097, - 0.8419658, - 0.5493732, - 0.64064896, - 0.97944254, - 0.5611503, - 
0.6145399, - 0.7063037, - 0.54406035, - ]) + expected_slice = np.array( + [ + 0.61711097, + 0.8419658, + 0.5493732, + 0.64064896, + 0.97944254, + 0.5611503, + 0.6145399, + 0.7063037, + 0.54406035, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -216,8 +217,7 @@ def test_vq_diffusion_classifier_free_sampling(self): expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy" ) - pipeline = VQDiffusionPipeline.from_pretrained( - "microsoft/vq-diffusion-ithq") + pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") pipeline = pipeline pipeline.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -225,7 +225,8 @@ def test_vq_diffusion_classifier_free_sampling(self): "teddy bear playing in the pool", num_images_per_prompt=1, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (256, 256, 3) assert np.abs(expected_image - image).max() < 0.01 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py index ce993b9501fb1..c578c2ffb27cd 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py @@ -20,7 +20,7 @@ class DDIMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDIMScheduler, ) + scheduler_classes = (DDIMScheduler,) forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50)) def get_scheduler_config(self, **kwargs): @@ -65,12 +65,10 @@ def test_steps_offset(self): scheduler_config = self.get_scheduler_config(steps_offset=1) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(5) - assert paddle.equal_all(scheduler.timesteps, - paddle.to_tensor([801, 601, 401, 201, 1])) + assert paddle.equal_all(scheduler.timesteps, paddle.to_tensor([801, 601, 401, 201, 1])) def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], - [0.002, 0.02, 0.2, 2]): + for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -92,7 +90,8 @@ def test_thresholding(self): self.check_over_configs( thresholding=True, prediction_type=prediction_type, - sample_max_value=threshold, ) + sample_max_value=threshold, + ) def test_time_indices(self): for t in [1, 10, 49]: @@ -100,8 +99,7 @@ def test_time_indices(self): def test_inference_steps(self): for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward( - time_step=t, num_inference_steps=num_inference_steps) + self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) def test_eta(self): for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]): @@ -112,18 +110,12 @@ def test_variance(self): scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) - assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - - 0.0)) < 1e-5 - assert (paddle.sum( - paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5) - assert (paddle.sum( - paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5) - assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - - 0.0)) < 1e-5 
- assert (paddle.sum( - paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5) - assert paddle.sum( - paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 def test_full_loop_no_noise(self): sample = self.full_loop() diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py index e9fa28609abda..9768d50cc5dbc 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py @@ -20,7 +20,7 @@ class DDPMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDPMScheduler, ) + scheduler_classes = (DDPMScheduler,) def get_scheduler_config(self, **kwargs): config = { @@ -40,8 +40,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], - [0.002, 0.02, 0.2, 2]): + for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -63,7 +62,8 @@ def test_thresholding(self): self.check_over_configs( thresholding=True, prediction_type=prediction_type, - sample_max_value=threshold, ) + sample_max_value=threshold, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "sample", "v_prediction"]: @@ -79,10 +79,8 @@ def test_variance(self): scheduler = scheduler_class(**scheduler_config) assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(487) - - 0.00979)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(999) - - 0.02)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 def test_full_loop_no_noise(self): scheduler_class = self.scheduler_classes[0] @@ -100,8 +98,7 @@ def test_full_loop_no_noise(self): residual = model(sample, t) # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, generator=generator).prev_sample + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample # if t > 0: # noise = self.dummy_sample_deter @@ -118,8 +115,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) num_trained_timesteps = len(scheduler) @@ -133,8 +129,7 @@ def test_full_loop_with_v_prediction(self): residual = model(sample, t) # 2. 
predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, generator=generator).prev_sample + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample # if t > 0: # noise = self.dummy_sample_deter @@ -178,13 +173,10 @@ def test_custom_timesteps_increasing_order(self): timesteps = [100, 87, 50, 51, 0] - with self.assertRaises( - ValueError, - msg="`custom_timesteps` must be in descending order."): + with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."): scheduler.set_timesteps(timesteps=timesteps) - def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps( - self): + def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) @@ -193,11 +185,10 @@ def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps( num_inference_steps = len(timesteps) with self.assertRaises( - ValueError, - msg="Can only pass one of `num_inference_steps` or `custom_timesteps`.", + ValueError, + msg="Can only pass one of `num_inference_steps` or `custom_timesteps`.", ): - scheduler.set_timesteps( - num_inference_steps=num_inference_steps, timesteps=timesteps) + scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) def test_custom_timesteps_too_large(self): scheduler_class = self.scheduler_classes[0] @@ -207,7 +198,7 @@ def test_custom_timesteps_too_large(self): timesteps = [scheduler.config.num_train_timesteps] with self.assertRaises( - ValueError, - msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", + ValueError, + msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", ): scheduler.set_timesteps(timesteps=timesteps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_deis.py b/ppdiffusers/tests/schedulers/test_scheduler_deis.py index b40af9f177525..7ea11c2198020 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_deis.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_deis.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class DEISMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DEISMultistepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (DEISMultistepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -43,38 +47,28 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = 
dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. - solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -84,9 +78,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -94,8 +86,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -104,18 +95,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -150,27 +135,20 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -210,7 +188,8 @@ def test_thresholding(self): sample_max_value=threshold, algorithm_type="deis", solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -225,14 +204,15 @@ def test_solver_order_and_type(self): solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + algorithm_type=algorithm_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -240,8 +220,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -257,8 +236,7 @@ def test_full_loop_with_v_prediction(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - 
thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py index 8935cd0ba072e..869b1cc9280d1 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class DPMSolverMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverMultistepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (DPMSolverMultistepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -49,38 +53,28 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -90,9 +84,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -100,8 +92,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -110,18 +101,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. - solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -152,27 +137,20 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. 
- solver_order] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -193,7 +171,8 @@ def test_thresholding(self): sample_max_value=threshold, algorithm_type="dpmsolver++", solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -208,14 +187,15 @@ def test_solver_order_and_type(self): solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + algorithm_type=algorithm_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -223,8 +203,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -233,10 +212,7 @@ def test_full_loop_no_noise(self): assert abs(result_mean.item() - 0.3301) < 1e-3 def test_full_loop_no_noise_thres(self): - sample = self.full_loop( - thresholding=True, - dynamic_thresholding_ratio=0.87, - sample_max_value=0.5) + sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5) result_mean = paddle.mean(paddle.abs(sample)) assert abs(result_mean.item() - 1.1364) < 1e-3 @@ -248,8 +224,7 @@ def test_full_loop_with_v_prediction(self): assert abs(result_mean.item() - 0.2251) < 1e-3 def test_full_loop_with_karras_and_v_prediction(self): - sample = self.full_loop( - prediction_type="v_prediction", use_karras_sigmas=True) + sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True) result_mean = paddle.mean(paddle.abs(sample)) assert abs(result_mean.item() - 0.2096) < 1e-3 @@ -275,8 +250,7 @@ def test_switch(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 @@ -297,5 +271,4 @@ def test_unique_timesteps(self, **config): scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(scheduler.config.num_train_timesteps) - assert len(scheduler.timesteps.unique( - )) == scheduler.num_inference_steps + assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps diff --git 
a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py index bb702887ed40f..ce229323bc363 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverSinglestepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (DPMSolverSinglestepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -48,38 +52,28 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -89,9 +83,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -99,8 +91,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -109,18 +100,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -178,7 +163,8 @@ def test_thresholding(self): sample_max_value=threshold, algorithm_type="dpmsolver++", solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -193,14 +179,15 @@ def test_solver_order_and_type(self): solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + algorithm_type=algorithm_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -208,8 +195,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -225,8 +211,7 @@ def test_full_loop_with_v_prediction(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler.py b/ppdiffusers/tests/schedulers/test_scheduler_euler.py index bdca25bba1cb3..d6cfc9fe4474b 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_euler.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_euler.py @@ -20,7 +20,7 @@ class EulerDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerDiscreteScheduler, ) + scheduler_classes = (EulerDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -68,8 +67,7 @@ def test_full_loop_no_noise(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = 
paddle.sum(paddle.abs(sample)) @@ -80,8 +78,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) @@ -96,8 +93,7 @@ def test_full_loop_with_v_prediction(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -123,8 +119,7 @@ def test_full_loop_device(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -150,8 +145,7 @@ def test_full_loop_device_karras_sigmas(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py index cb2d308947d3b..fdc7f2a34f30f 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py @@ -20,7 +20,7 @@ class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerAncestralDiscreteScheduler, ) + scheduler_classes = (EulerAncestralDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -68,8 +67,7 @@ def test_full_loop_no_noise(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -80,8 +78,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) @@ -96,8 +93,7 @@ def test_full_loop_with_v_prediction(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -122,8 +118,7 @@ def test_full_loop_device(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, 
sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_heun.py b/ppdiffusers/tests/schedulers/test_scheduler_heun.py index b8223700592bb..0f62ae519f4e0 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_heun.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_heun.py @@ -20,7 +20,7 @@ class HeunDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (HeunDiscreteScheduler, ) + scheduler_classes = (HeunDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -78,8 +77,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py index 39558436871af..c282c6a61079b 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py @@ -22,8 +22,8 @@ class IPNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (IPNDMScheduler, ) - forward_default_kwargs = (("num_inference_steps", 50), ) + scheduler_classes = (IPNDMScheduler,) + forward_default_kwargs = (("num_inference_steps", 50),) def get_scheduler_config(self, **kwargs): config = {"num_train_timesteps": 1000} @@ -59,21 +59,15 @@ def check_over_configs(self, time_step=0, **config): # copy over dummy past residuals new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -110,21 +104,15 @@ def check_over_forward(self, time_step=0, **forward_kwargs): # copy over dummy past residual (must be after setting timesteps) new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - 
new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, **config): scheduler_class = self.scheduler_classes[0] @@ -158,11 +146,9 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) @@ -177,31 +163,25 @@ def test_step_shape(self): time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) def test_timesteps(self): for timesteps in [100, 1000]: - self.check_over_configs( - num_train_timesteps=timesteps, time_step=None) + self.check_over_configs(num_train_timesteps=timesteps, time_step=None) def test_inference_steps(self): for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=None) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None) def test_full_loop_no_noise(self): sample = self.full_loop() diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py index 4081289cebb20..770b4f226ba5c 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py @@ -20,7 +20,7 @@ class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest): - 
scheduler_classes = (KDPM2AncestralDiscreteScheduler, ) + scheduler_classes = (KDPM2AncestralDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -65,8 +64,7 @@ def test_full_loop_no_noise(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -82,8 +80,7 @@ def test_prediction_type(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) @@ -98,8 +95,7 @@ def test_full_loop_with_v_prediction(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -125,8 +121,7 @@ def test_full_loop_device(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py index ee87c662588d7..3da7b7e75fd44 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py @@ -20,7 +20,7 @@ class KDPM2DiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2DiscreteScheduler, ) + scheduler_classes = (KDPM2DiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -53,8 +52,7 @@ def test_prediction_type(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_lms.py b/ppdiffusers/tests/schedulers/test_scheduler_lms.py index 0be32200e94c8..8ee87bbddf624 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_lms.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_lms.py @@ -20,7 +20,7 @@ class 
LMSDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (LMSDiscreteScheduler, ) + scheduler_classes = (LMSDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -81,8 +80,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py index ab94b8ffca3f3..ad2998c26bfd9 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py @@ -22,8 +22,8 @@ class PNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (PNDMScheduler, ) - forward_default_kwargs = (("num_inference_steps", 50), ) + scheduler_classes = (PNDMScheduler,) + forward_default_kwargs = (("num_inference_steps", 50),) def get_scheduler_config(self, **kwargs): config = { @@ -62,21 +62,15 @@ def check_over_configs(self, time_step=0, **config): # copy over dummy past residuals new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -110,21 +104,15 @@ def check_over_forward(self, time_step=0, **forward_kwargs): # copy over dummy past residual (must be after setting timesteps) new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - 
new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, **config): scheduler_class = self.scheduler_classes[0] @@ -158,11 +146,9 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) @@ -174,18 +160,14 @@ def test_step_shape(self): ] scheduler.ets = dummy_past_residuals[:] - output_0 = scheduler.step_prk(residual, 0, sample, - **kwargs).prev_sample - output_1 = scheduler.step_prk(residual, 1, sample, - **kwargs).prev_sample + output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample + output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) - output_0 = scheduler.step_plms(residual, 0, sample, - **kwargs).prev_sample - output_1 = scheduler.step_plms(residual, 1, sample, - **kwargs).prev_sample + output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample + output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -204,27 +186,30 @@ def test_steps_offset(self): scheduler.set_timesteps(10) assert paddle.equal_all( scheduler.timesteps, - paddle.to_tensor([ - 901, - 851, - 851, - 801, - 801, - 751, - 751, - 701, - 701, - 651, - 651, - 601, - 601, - 501, - 401, - 301, - 201, - 101, - 1, - ]), ) + paddle.to_tensor( + [ + 901, + 851, + 851, + 801, + 801, + 751, + 751, + 701, + 701, + 651, + 651, + 601, + 601, + 501, + 401, + 301, + 201, + 101, + 1, + ] + ), + ) def test_betas(self): for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]): @@ -269,8 +254,7 @@ def test_inference_plms_no_past_residuals(self): scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) - scheduler.step_plms(self.dummy_sample, 1, - self.dummy_sample).prev_sample + scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample def test_full_loop_no_noise(self): sample = self.full_loop() diff --git a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py index 3c2c1cd8ac641..ac15c502eda8d 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py @@ -23,7 +23,7 @@ class ScoreSdeVeSchedulerTest(unittest.TestCase): # TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration) - 
scheduler_classes = (ScoreSdeVeScheduler, ) + scheduler_classes = (ScoreSdeVeScheduler,) forward_default_kwargs = () @property @@ -85,34 +85,22 @@ def check_over_configs(self, time_step=0, **config): new_scheduler = scheduler_class.from_pretrained(tmpdirname) output = scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" output = scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler correction are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" def check_over_forward(self, time_step=0, **forward_kwargs): kwargs = dict(self.forward_default_kwargs) @@ -130,34 +118,22 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler = scheduler_class.from_pretrained(tmpdirname) output = scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" output = scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler correction are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" def test_timesteps(self): for timesteps in [10, 100, 1000]: @@ -193,15 +169,12 @@ def test_full_loop_no_noise(self): for _ in range(scheduler.config.correct_steps): with paddle.no_grad(): model_output = model(sample, sigma_t) - sample = scheduler.step_correct( - model_output, sample, generator=generator, - **kwargs).prev_sample + sample = scheduler.step_correct(model_output, 
sample, generator=generator, **kwargs).prev_sample with paddle.no_grad(): model_output = model(sample, sigma_t) - output = scheduler.step_pred( - model_output, t, sample, generator=generator, **kwargs) + output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs) sample, _ = output.prev_sample, output.prev_sample_mean result_sum = paddle.sum(paddle.abs(sample)) @@ -222,25 +195,17 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps output_0 = scheduler.step_pred( - residual, - 0, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, 0, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample output_1 = scheduler.step_pred( - residual, - 1, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, 1, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py index 5ac931e6abef5..b37fa2c513271 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py @@ -21,7 +21,7 @@ # UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration. 
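(Aside, not part of the diff: almost every hunk above reformats the same save/reload round trip that these scheduler tests rely on — save the config, reload it with from_pretrained, step both instances with the same seeded generator, and assert the outputs match. The sketch below shows that pattern in isolation; the choice of DDPMScheduler, the 8x8 dummy sample, and the 50-step schedule are illustrative assumptions and do not come from this PR.)

    # Minimal sketch of the save/reload round-trip checked throughout these tests.
    import tempfile

    import paddle

    from ppdiffusers import DDPMScheduler

    scheduler = DDPMScheduler(num_train_timesteps=1000)
    scheduler.set_timesteps(50)

    sample = paddle.ones([1, 3, 8, 8])  # dummy latent standing in for a model input
    residual = 0.1 * sample             # dummy model output

    # Save the scheduler config and reload it, as check_over_configs does above.
    with tempfile.TemporaryDirectory() as tmpdirname:
        scheduler.save_config(tmpdirname)
        reloaded = DDPMScheduler.from_pretrained(tmpdirname)
    reloaded.set_timesteps(50)

    # Step both instances with identically seeded generators and compare outputs.
    t = scheduler.timesteps[0]
    out = scheduler.step(residual, t, sample, generator=paddle.Generator().manual_seed(0)).prev_sample
    new_out = reloaded.step(residual, t, sample, generator=paddle.Generator().manual_seed(0)).prev_sample
    assert paddle.sum(paddle.abs(out - new_out)) < 1e-5, "Scheduler outputs are not identical"

(End of aside; the diff continues below.)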
class UnCLIPSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UnCLIPScheduler, ) + scheduler_classes = (UnCLIPScheduler,) def get_scheduler_config(self, **kwargs): config = { @@ -61,36 +61,27 @@ def test_time_indices(self): if prev_timestep is not None and prev_timestep >= time_step: continue - self.check_over_forward( - time_step=time_step, prev_timestep=prev_timestep) + self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep) def test_variance_fixed_small_log(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - variance_type="fixed_small_log") + scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log") scheduler = scheduler_class(**scheduler_config) - assert paddle.sum(paddle.abs(scheduler._get_variance(0) - - 1.0000e-10)) < 1e-5 - assert paddle.sum( - paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5 - assert paddle.sum( - paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5 def test_variance_learned_range(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - variance_type="learned_range") + scheduler_config = self.get_scheduler_config(variance_type="learned_range") scheduler = scheduler_class(**scheduler_config) predicted_variance = 0.5 - assert (scheduler._get_variance( - 1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5) - assert (scheduler._get_variance( - 487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5) - assert (scheduler._get_variance( - 999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5) + assert scheduler._get_variance(1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5 + assert scheduler._get_variance(487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5 + assert scheduler._get_variance(999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5 def test_full_loop(self): scheduler_class = self.scheduler_classes[0] @@ -108,8 +99,7 @@ def test_full_loop(self): residual = model(sample, t) # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, generator=generator).prev_sample + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample sample = pred_prev_sample @@ -143,11 +133,8 @@ def test_full_loop_skip_timesteps(self): # 2. 
predict previous mean of sample x_t-1 pred_prev_sample = scheduler.step( - residual, - t, - sample, - prev_timestep=prev_timestep, - generator=generator).prev_sample + residual, t, sample, prev_timestep=prev_timestep, generator=generator + ).prev_sample sample = pred_prev_sample diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py index 7d28f06cd5fb7..0c19a3bb8387a 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class UniPCMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UniPCMultistepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (UniPCMultistepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -44,47 +48,35 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def check_over_forward(self, time_step=0, **forward_kwargs): kwargs = dict(self.forward_default_kwargs) num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -92,8 +84,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -102,18 +93,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. - solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -148,27 +133,20 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. 
- solver_order] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -207,7 +185,8 @@ def test_thresholding(self): prediction_type=prediction_type, sample_max_value=threshold, solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -220,13 +199,14 @@ def test_solver_order_and_type(self): self.check_over_configs( solver_order=order, solver_type=solver_type, - prediction_type=prediction_type, ) + prediction_type=prediction_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, - prediction_type=prediction_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + prediction_type=prediction_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -234,8 +214,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -251,8 +230,7 @@ def test_full_loop_with_v_prediction(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 @@ -272,5 +250,4 @@ def test_unique_timesteps(self, **config): scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(scheduler.config.num_train_timesteps) - assert len(scheduler.timesteps.unique( - )) == scheduler.num_inference_steps + assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps diff --git a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py index 81ed3de4a1062..c40e7834d682f 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py @@ -21,7 +21,7 @@ class VQDiffusionSchedulerTest(SchedulerCommonTest): - scheduler_classes = (VQDiffusionScheduler, ) + scheduler_classes = (VQDiffusionScheduler,) def get_scheduler_config(self, **kwargs): config = { @@ -37,8 +37,7 @@ def dummy_sample(self, num_vec_classes): height = 8 width = 8 - sample = paddle.randint(0, num_vec_classes, - (batch_size, height * width)) + sample = paddle.randint(0, num_vec_classes, (batch_size, height * width)) return sample @@ -49,10 +48,8 @@ def dummy_sample_deter(self): def dummy_model(self, num_vec_classes): def model(sample, t, *args): batch_size, num_latent_pixels = 
sample.shape - logits = paddle.rand( - (batch_size, num_vec_classes - 1, num_latent_pixels)) - return_value = F.log_softmax( - logits.cast("float64"), axis=1).cast("float32") + logits = paddle.rand((batch_size, num_vec_classes - 1, num_latent_pixels)) + return_value = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") return return_value return model diff --git a/ppdiffusers/tests/schedulers/test_schedulers.py b/ppdiffusers/tests/schedulers/test_schedulers.py index f01069d246e6a..92b11a679f661 100755 --- a/ppdiffusers/tests/schedulers/test_schedulers.py +++ b/ppdiffusers/tests/schedulers/test_schedulers.py @@ -24,9 +24,14 @@ import paddle import ppdiffusers -from ppdiffusers import (EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, IPNDMScheduler, - LMSDiscreteScheduler, VQDiffusionScheduler, logging) +from ppdiffusers import ( + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + IPNDMScheduler, + LMSDiscreteScheduler, + VQDiffusionScheduler, + logging, +) from ppdiffusers.configuration_utils import ConfigMixin, register_to_config from ppdiffusers.schedulers.scheduling_utils import SchedulerMixin from ppdiffusers.utils.testing_utils import CaptureLogger @@ -37,12 +42,13 @@ class SchedulerObject(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], ): + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + e=[1, 3], + ): pass @@ -51,12 +57,13 @@ class SchedulerObject2(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], ): + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + f=[1, 3], + ): pass @@ -65,13 +72,14 @@ class SchedulerObject3(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], ): + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + e=[1, 3], + f=[1, 3], + ): pass @@ -90,15 +98,11 @@ def test_save_load_from_different_config(self): new_obj_1 = SchedulerObject2.from_config(config) # now save a config parameter that is not expected - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "r") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: data = json.load(f) data["unexpected"] = True - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "w") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: json.dump(data, f) with CaptureLogger(logger) as cap_logger_2: @@ -115,12 +119,12 @@ def test_save_load_from_different_config(self): assert cap_logger_1.out == "" assert ( - cap_logger_2.out == - "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" + cap_logger_2.out + == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" " will" - " be ignored. Please verify your config.json configuration file.\n") - assert (cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") - == cap_logger_3.out) + " be ignored. 
Please verify your config.json configuration file.\n" + ) + assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out def test_save_load_compatible_schedulers(self): SchedulerObject2._compatibles = ["SchedulerObject"] @@ -137,16 +141,12 @@ def test_save_load_compatible_schedulers(self): obj.save_config(tmpdirname) # now save a config parameter that is expected by another class, but not origin class - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "r") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: data = json.load(f) data["f"] = [0, 0] data["unexpected"] = True - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "w") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: json.dump(data, f) with CaptureLogger(logger) as cap_logger: @@ -156,10 +156,11 @@ def test_save_load_compatible_schedulers(self): assert new_obj.__class__ == SchedulerObject assert ( - cap_logger.out == - "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" + cap_logger.out + == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" " will" - " be ignored. Please verify your config.json configuration file.\n") + " be ignored. Please verify your config.json configuration file.\n" + ) def test_save_load_from_different_config_comp_schedulers(self): SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"] @@ -195,14 +196,8 @@ def test_save_load_from_different_config_comp_schedulers(self): assert new_obj_3.__class__ == SchedulerObject3 assert cap_logger_1.out == "" - assert ( - cap_logger_2.out == - "{'f'} was not found in config. Values will be initialized to default values.\n" - ) - assert ( - cap_logger_3.out == - "{'f'} was not found in config. Values will be initialized to default values.\n" - ) + assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n" + assert cap_logger_3.out == "{'f'} was not found in config. 
Values will be initialized to default values.\n" class SchedulerCommonTest(unittest.TestCase): @@ -252,9 +247,10 @@ def check_over_configs(self, time_step=0, **config): for scheduler_class in self.scheduler_classes: # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): time_step = float(time_step) scheduler_config = self.get_scheduler_config(**config) @@ -273,12 +269,10 @@ def check_over_configs(self, time_step=0, **config): scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # Make sure `scale_model_input` is invoked to prevent a warning @@ -287,20 +281,15 @@ def check_over_configs(self, time_step=0, **config): _ = new_scheduler.scale_model_input(sample, 0) # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def check_over_forward(self, time_step=0, **forward_kwargs): kwargs = dict(self.forward_default_kwargs) @@ -310,9 +299,10 @@ def check_over_forward(self, time_step=0, **forward_kwargs): for scheduler_class in self.scheduler_classes: if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): time_step = float(time_step) scheduler_config = self.get_scheduler_config() @@ -331,28 +321,21 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, 
"set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): kwargs = dict(self.forward_default_kwargs) @@ -362,9 +345,10 @@ def test_from_save_pretrained(self): for scheduler_class in self.scheduler_classes: timestep = 1 if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): timestep = float(timestep) scheduler_config = self.get_scheduler_config() @@ -383,28 +367,21 @@ def test_from_save_pretrained(self): scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, timestep, sample, - **kwargs).prev_sample + output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, timestep, sample, - **kwargs).prev_sample + new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_compatibles(self): for scheduler_class in self.scheduler_classes: @@ -415,31 +392,20 @@ def test_compatibles(self): assert all(c is not None for c in scheduler.compatibles) for comp_scheduler_cls in scheduler.compatibles: - comp_scheduler = comp_scheduler_cls.from_config( - scheduler.config) + comp_scheduler = comp_scheduler_cls.from_config(scheduler.config) assert 
comp_scheduler is not None new_scheduler = scheduler_class.from_config(comp_scheduler.config) - new_scheduler_config = { - k: v - for k, v in new_scheduler.config.items() - if k in scheduler.config - } - scheduler_diff = { - k: v - for k, v in new_scheduler.config.items() - if k not in scheduler.config - } + new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config} + scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config} # make sure that configs are essentially identical assert new_scheduler_config == dict(scheduler.config) # make sure that only differences are for configs that are not in init - init_keys = inspect.signature( - scheduler_class.__init__).parameters.keys() - assert set(scheduler_diff.keys()).intersection(set( - init_keys)) == set() + init_keys = inspect.signature(scheduler_class.__init__).parameters.keys() + assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set() def test_from_pretrained(self): for scheduler_class in self.scheduler_classes: @@ -463,9 +429,10 @@ def test_step_shape(self): for scheduler_class in self.scheduler_classes: if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): timestep_0 = float(timestep_0) timestep_1 = float(timestep_1) @@ -481,17 +448,13 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps - output_0 = scheduler.step(residual, timestep_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, timestep_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -504,12 +467,10 @@ def set_nan_tensor_to_zero(t): def recursive_check(tuple_object, dict_object): if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object, dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) elif tuple_object is None: return @@ -518,27 +479,29 @@ def recursive_check(tuple_object, dict_object): paddle.allclose( set_nan_tensor_to_zero(tuple_object).cast("float32"), set_nan_tensor_to_zero(dict_object).cast("float32"), - atol=1e-5, ), + atol=1e-5, + ), msg=( "Tuple and dict output are not equal. Difference:" f" {paddle.max(paddle.abs(tuple_object - dict_object))}. Tuple has `nan`:" f" {paddle.isnan(tuple_object).any()} and `inf`: {paddle.isinf(tuple_object)}. 
Dict has" f" `nan`: {paddle.isnan(dict_object).any()} and `inf`: {paddle.isinf(dict_object)}." - ), ) + ), + ) kwargs = dict(self.forward_default_kwargs) num_inference_steps = kwargs.pop("num_inference_steps", 50) timestep = 0 - if (len(self.scheduler_classes) > 0 and - self.scheduler_classes[0] == IPNDMScheduler): + if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler: timestep = 1 for scheduler_class in self.scheduler_classes: if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): timestep = float(timestep) scheduler_config = self.get_scheduler_config() @@ -553,32 +516,25 @@ def recursive_check(tuple_object, dict_object): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) outputs_dict = scheduler.step(residual, timestep, sample, **kwargs) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - outputs_tuple = scheduler.step( - residual, timestep, sample, return_dict=False, **kwargs) + outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs) recursive_check(outputs_tuple, outputs_dict) @@ -594,8 +550,11 @@ def test_scheduler_public_api(self): ) self.assertTrue( hasattr(scheduler, "scale_model_input"), - (f"{scheduler_class} does not implement a required class method `scale_model_input(sample," - " timestep)`"), ) + ( + f"{scheduler_class} does not implement a required class method `scale_model_input(sample," + " timestep)`" + ), + ) self.assertTrue( hasattr(scheduler, "step"), f"{scheduler_class} does not implement a required class method `step(...)`", @@ -625,9 +584,7 @@ def test_add_noise_device(self): def test_deprecated_kwargs(self): for scheduler_class in self.scheduler_classes: - has_kwarg_in_model_class = ( - "kwargs" in - inspect.signature(scheduler_class.__init__).parameters) + has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters has_deprecated_kwarg = 
len(scheduler_class._deprecated_kwargs) > 0 if has_kwarg_in_model_class and not has_deprecated_kwarg: @@ -635,7 +592,8 @@ def test_deprecated_kwargs(self): f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated" " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if" " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`") + " []`" + ) if not has_kwarg_in_model_class and has_deprecated_kwarg: raise ValueError( @@ -651,8 +609,7 @@ def test_trained_betas(self): continue scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class( - **scheduler_config, trained_betas=np.array([0.1, 0.3])) + scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3])) with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_pretrained(tmpdirname) @@ -680,8 +637,7 @@ def test_getattr_is_correct(self): # no warning should be thrown assert cap_logger.out == "" - logger = logging.get_logger( - "ppdiffusers.schedulers.schedulering_utils") + logger = logging.get_logger("ppdiffusers.schedulers.schedulering_utils") # 30 for warning logger.setLevel(30) with CaptureLogger(logger) as cap_logger: @@ -703,7 +659,4 @@ def test_getattr_is_correct(self): with self.assertRaises(AttributeError) as error: scheduler.does_not_exist - assert ( - str(error.exception) == - f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'" - ) + assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..4b868b99b22f9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.isort] +profile = 'black' +known_third_party = ["paddle"] + +[tool.black] +line-length = 119 +target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310'] +exclude = ['.flake8'] + +[tool.pytest.ini_options] +minversion = "6.0" +pythonpath = ["."] +testpaths = [ + # "tests/models", +] +python_files = [ + "test.py", + "test_*.py" +] +filterwarnings = [ + "ignore::UserWarning", + 'ignore::DeprecationWarning', +] \ No newline at end of file diff --git a/setup.py b/setup.py index 2578b4bad4f96..0074ba09ce033 100644 --- a/setup.py +++ b/setup.py @@ -46,8 +46,7 @@ def read_requirements(): setup( name="paddlemix", - packages=(find_packages() + find_packages( - where="./ppdiffusers", exclude=["tests", "tests.*"])), + packages=(find_packages() + find_packages(where="./ppdiffusers", exclude=["tests", "tests.*"])), package_dir={ "": ".", "ppdiffusers": "./ppdiffusers/ppdiffusers", @@ -62,10 +61,7 @@ def read_requirements(): keywords=["paddle", "paddlemix"], install_requires=REQUIRED_PACKAGES, python_requires=">=3.6", - entry_points={ - "console_scripts": - ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"] - }, + entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]}, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", @@ -75,4 +71,5 @@ def read_requirements(): "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], - license="Apache 2.0", ) + license="Apache 2.0", +) diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py index 4193ca0f0a0de..d11db96722581 100644 --- a/tests/models/test_blip2.py +++ b/tests/models/test_blip2.py @@ -21,49 +21,54 @@ import numpy as np import paddle import paddle.nn as nn -import 
requests from paddlenlp.transformers.opt.configuration import OPTConfig -from PIL import Image -from paddlemix.models.blip2 import (Blip2Config, Blip2ForConditionalGeneration, - Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.models.blip2 import ( + Blip2Config, + Blip2ForConditionalGeneration, + Blip2QFormerConfig, + Blip2VisionConfig, +) from paddlemix.models.blip2.eva_vit import VisionTransformer -from paddlemix.models.blip2.modeling import \ - BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST +from paddlemix.models.blip2.modeling import BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST from paddlemix.models.blip2.Qformer import BertLMHeadModel from tests.models.test_configuration_common import ConfigTester from tests.models.test_modeling_common import ( - ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask) + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) from tests.testing_utils import slow def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if ("_range" in key or "_std" in key or "initializer_factor" in key or - "layer_scale" in key): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: setattr(configs_no_init, key, 1e-10) return configs_no_init class Blip2VisionModelTester: def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=1408, - projection_dim=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, ): + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=1408, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.image_size = image_size @@ -81,13 +86,11 @@ def __init__( self.scope = scope # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size)**2 + num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches + 1 def prepare_config_and_inputs(self): - pixel_values = floats_tensor([ - self.batch_size, self.num_channels, self.image_size, self.image_size - ]) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) config = self.get_config() return config, pixel_values @@ -104,7 +107,8 @@ def get_config(self): intermediate_size=self.intermediate_size, dropout=self.dropout, attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, ) + initializer_range=self.initializer_range, + ) def create_and_check_model(self, config, pixel_values): model = VisionTransformer(config=config) @@ -114,13 +118,12 @@ def create_and_check_model(self, config, pixel_values): # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) image_size = (self.image_size, self.image_size) patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // - patch_size[0]) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) self.parent.assertEqual( result.last_hidden_state.shape, - [self.batch_size, num_patches + 1, self.hidden_size], ) - self.parent.assertEqual(result.pooler_output.shape, - [self.batch_size, 
self.hidden_size]) + [self.batch_size, num_patches + 1, self.hidden_size], + ) + self.parent.assertEqual(result.pooler_output.shape, [self.batch_size, self.hidden_size]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -135,7 +138,7 @@ class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = (VisionTransformer, ) + all_model_classes = (VisionTransformer,) fx_compatible = False test_pruning = False test_resize_embeddings = False @@ -148,7 +151,8 @@ def setUp(self): self, config_class=Blip2VisionConfig, has_text_modality=False, - hidden_size=37, ) + hidden_size=37, + ) def test_config(self): self.config_tester.run_common_tests() @@ -191,28 +195,29 @@ def test_model_from_pretrained(self): class BertLMHeadModelTester: def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=768, - projection_dim=32, - num_hidden_layers=6, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - num_patches=257, - encoder_hidden_size=1408, - encoder_width=1408, ): + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=768, + projection_dim=32, + num_hidden_layers=6, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + num_patches=257, + encoder_hidden_size=1408, + encoder_width=1408, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -236,12 +241,9 @@ def __init__( self.encoder_width = encoder_width def prepare_config_and_inputs(self): - query_embeds = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size]) - encoder_hidden_states = floats_tensor( - [self.batch_size, self.num_patches, self.encoder_hidden_size]) - encoder_attention_mask = random_attention_mask( - [self.batch_size, self.num_patches]) + query_embeds = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_hidden_states = floats_tensor([self.batch_size, self.num_patches, self.encoder_hidden_size]) + encoder_attention_mask = random_attention_mask([self.batch_size, self.num_patches]) config = self.get_config() return config, query_embeds, encoder_hidden_states, encoder_attention_mask @@ -259,19 +261,21 @@ def get_config(self): max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, bos_token_id=self.bos_token_id, - encoder_hidden_size=self.encoder_hidden_size, ) + encoder_hidden_size=self.encoder_hidden_size, + ) - def create_and_check_model(self, config, query_embeds, - encoder_hidden_states, encoder_attention_mask): + def create_and_check_model(self, config, query_embeds, encoder_hidden_states, encoder_attention_mask): model = BertLMHeadModel(config=config, encoder_width=self.encoder_width) model.eval() result = model( query_embeds=query_embeds, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, ) + encoder_attention_mask=encoder_attention_mask, + ) self.parent.assertEqual( result.last_hidden_state.shape, - [self.batch_size, self.seq_length, self.hidden_size], ) + [self.batch_size, self.seq_length, self.hidden_size], + ) model = 
BertLMHeadModel(config=config) model.eval() @@ -279,11 +283,13 @@ def create_and_check_model(self, config, query_embeds, result = model( query_embeds, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, ) + encoder_attention_mask=encoder_attention_mask, + ) self.parent.assertEqual( result.last_hidden_state.shape, - [self.batch_size, self.seq_length, self.hidden_size], ) + [self.batch_size, self.seq_length, self.hidden_size], + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -291,7 +297,8 @@ def prepare_config_and_inputs_for_common(self): config, query_embeds, encoder_hidden_states, - encoder_attention_mask, ) = config_and_inputs + encoder_attention_mask, + ) = config_and_inputs inputs_dict = { "query_embeds": query_embeds, "encoder_hidden_states": encoder_hidden_states, @@ -301,7 +308,7 @@ def prepare_config_and_inputs_for_common(self): class BertLMHeadModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BertLMHeadModel, ) + all_model_classes = (BertLMHeadModel,) fx_compatible = False test_pruning = False test_resize_embeddings = False @@ -314,7 +321,8 @@ def setUp(self): self, config_class=Blip2QFormerConfig, has_text_modality=False, - hidden_size=37, ) + hidden_size=37, + ) def test_config(self): self.config_tester.run_common_tests() @@ -337,28 +345,29 @@ def test_save_load(self): class Blip2TextModelTester: def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - embed_dim=16, - num_labels=3, - word_embed_proj_dim=16, - type_sequence_label_size=2, ): + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + type_sequence_label_size=2, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -385,14 +394,12 @@ def __init__( def prepare_config_and_inputs(self): config = self.get_config() - input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size, - dtype="int64").clip(3, ) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64").clip( + 3, + ) input_ids[:, -1] = self.eos_token_id # Eos Token - attention_mask = input_ids.not_equal( - paddle.to_tensor( - [self.pad_token_id], dtype="int64")).cast("int64") + attention_mask = input_ids.not_equal(paddle.to_tensor([self.pad_token_id], dtype="int64")).cast("int64") return config, input_ids, attention_mask @@ -411,18 +418,20 @@ def get_config(self): pad_token_id=self.pad_token_id, embed_dim=self.embed_dim, is_encoder_decoder=False, - word_embed_proj_dim=self.word_embed_proj_dim, ) + word_embed_proj_dim=self.word_embed_proj_dim, + ) class Blip2ModelTester: def __init__( - self, - parent, - vision_kwargs=None, - qformer_kwargs=None, - text_kwargs=None, - is_training=True, - num_query_tokens=10, ): + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + 
text_kwargs=None, + is_training=True, + num_query_tokens=10, + ): if vision_kwargs is None: vision_kwargs = {} if qformer_kwargs is None: @@ -431,10 +440,8 @@ def __init__( text_kwargs = {} self.parent = parent - self.vision_model_tester = Blip2VisionModelTester(parent, - **vision_kwargs) - self.qformer_model_tester = BertLMHeadModelTester(parent, - **qformer_kwargs) + self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = BertLMHeadModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) self.is_training = is_training self.num_query_tokens = num_query_tokens @@ -456,16 +463,15 @@ def get_config(self): vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), - num_query_tokens=self.num_query_tokens, ) + num_query_tokens=self.num_query_tokens, + ) @unittest.skip(reason="BLIP-2's output needs to unified") - def create_and_check_for_conditional_generation( - self, config, input_ids, attention_mask, pixel_values): + def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values): model = Blip2ForConditionalGeneration(config) model.eval() with paddle.no_grad(): - result = model( - pixel_values, input_ids, attention_mask, return_dict=True) + result = model(pixel_values, input_ids, attention_mask, return_dict=True) self.parent.assertEqual( result.logits.shape, @@ -473,7 +479,8 @@ def create_and_check_for_conditional_generation( self.vision_model_tester.batch_size, self.text_model_tester.seq_length + self.num_query_tokens, self.text_model_tester.vocab_size, - ], ) + ], + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -481,7 +488,8 @@ def prepare_config_and_inputs_for_common(self): config, input_ids, attention_mask, - pixel_values, ) = config_and_inputs + pixel_values, + ) = config_and_inputs inputs_dict = { "pixel_values": pixel_values, "input_ids": input_ids, @@ -491,7 +499,7 @@ def prepare_config_and_inputs_for_common(self): class Blip2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Blip2ForConditionalGeneration, ) + all_model_classes = (Blip2ForConditionalGeneration,) fx_compatible = False test_head_masking = False test_pruning = False @@ -505,16 +513,14 @@ def setUp(self): def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_conditional_generation( - *config_and_inputs) + self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_determinism(first, second): out_1 = first.numpy() @@ -551,22 +557,19 @@ def test_forward_signature(self): self.assertListEqual(arg_names[:1], expected_arg_names) def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # Save Blip2Config and check if we can load Blip2VisionConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: 
config.save_pretrained(tmp_dir_name) vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), - vision_config.to_dict()) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) # Save Blip2Config and check if we can load Blip2QFormerConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.qformer_config.to_dict(), - qformer_config.to_dict()) + self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) @slow def test_model_from_pretrained(self): diff --git a/tests/models/test_configuration_common.py b/tests/models/test_configuration_common.py index b014bbfe522ea..839941f706385 100644 --- a/tests/models/test_configuration_common.py +++ b/tests/models/test_configuration_common.py @@ -12,22 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import json import os import tempfile -import unittest.mock as mock - -from paddlenlp.transformers.configuration_utils import PretrainedConfig -from requests.exceptions import HTTPError class ConfigTester(object): - def __init__(self, - parent, - config_class=None, - has_text_modality=True, - **kwargs): + def __init__(self, parent, config_class=None, has_text_modality=True, **kwargs): self.parent = parent self.config_class = config_class self.has_text_modality = has_text_modality diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 43ae283d9149d..226caf803f84a 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -18,9 +18,7 @@ import os import random import shutil -import subprocess import tempfile -import time import unittest from typing import Optional, Tuple, Type @@ -36,8 +34,7 @@ def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if ("_range" in key or "_std" in key or "initializer_factor" in key or - "layer_scale" in key): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: setattr(configs_no_init, key, 1e-10) return configs_no_init @@ -64,11 +61,8 @@ def floats_tensor(shape, scale=1.0): return scale * paddle.randn(shape, dtype="float32") -def check_two_model_parameter(first_model: PretrainedModel, - second_model: PretrainedModel): - assert (len( - set(first_model.state_dict().keys()) - set(second_model.state_dict() - .keys())) == 0) +def check_two_model_parameter(first_model: PretrainedModel, second_model: PretrainedModel): + assert len(set(first_model.state_dict().keys()) - set(second_model.state_dict().keys())) == 0 # random choice the keys to compare key = random.choice(list(first_model.state_dict().keys())) @@ -106,8 +100,7 @@ def _make_model_instance(self, config, model_class): return model_class(self.base_model_class(**config)) def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_save_load(out1, out2): # make sure we don't have nans @@ -123,16 +116,14 @@ def check_save_load(out1, out2): model = self._make_model_instance(config, model_class) model.eval() with paddle.no_grad(): - first = model(**self._prepare_for_class(inputs_dict, - model_class))[0] + first = 
model(**self._prepare_for_class(inputs_dict, model_class))[0]

             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(tmpdirname)
                 model.eval()
                 with paddle.no_grad():
-                    second = model(**self._prepare_for_class(inputs_dict,
-                                                             model_class))[0]
+                    second = model(**self._prepare_for_class(inputs_dict, model_class))[0]

             # support tuple of tensor
             if isinstance(first, tuple) and isinstance(second, tuple):
@@ -142,8 +133,7 @@ def check_save_load(out1, out2):
             check_save_load(first, second)

     def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         def check_determinism(first, second):
             out_1 = first.numpy()
@@ -157,10 +147,8 @@ def check_determinism(first, second):
             model = self._make_model_instance(config, model_class)
             model.eval()
             with paddle.no_grad():
-                first = model(**self._prepare_for_class(inputs_dict,
-                                                        model_class))[0]
-                second = model(**self._prepare_for_class(inputs_dict,
-                                                         model_class))[0]
+                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+                second = model(**self._prepare_for_class(inputs_dict, model_class))[0]

             if isinstance(first, tuple) and isinstance(second, tuple):
                 for tensor1, tensor2 in zip(first, second):
@@ -190,30 +178,21 @@ def test_training_gradient_checkpointing(self):
     def test_attention_outputs(self):
         if not self.has_attentions:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length",
-                                     seq_len)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
-                                     seq_len)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length",
-                                     decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length",
-                                     encoder_seq_length)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
         chunk_length = getattr(self.model_tester, "chunk_length", None)
-        if chunk_length is not None and hasattr(self.model_tester,
-                                                "num_hashes"):
+        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
             encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes

         for model_class in self.all_model_classes:
             signature = inspect.signature(model_class.forward)
             # signature.parameters is an OrderedDict => so arg_names order is deterministic
             arg_names = [*signature.parameters.keys()]
-            if not all(
-                    name in arg_names
-                    for name in
-                ["output_attentions", "output_hidden_states", "return_dict"]):
+            if not all(name in arg_names for name in ["output_attentions", "output_hidden_states", "return_dict"]):
                 continue
             inputs_dict["output_attentions"] = True
             inputs_dict["output_hidden_states"] = False
@@ -221,12 +200,9 @@ def test_attention_outputs(self):
             model = self._make_model_instance(config, model_class)
             model.eval()
             with paddle.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict,
-                                                          model_class))
-            attentions = (outputs.encoder_attentions
-                          if self.is_encoder_decoder else outputs.attentions)
-            self.assertEqual(
-                len(attentions), self.model_tester.num_hidden_layers)
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

             # TODO(guosheng): check that output_attentions also work using config

@@ -238,7 +214,8 @@ def test_attention_outputs(self):
                         encoder_seq_length,
                         chunk_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )
             else:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
@@ -246,7 +223,8 @@ def test_attention_outputs(self):
                         self.model_tester.num_attention_heads,
                         encoder_seq_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )
             out_len = len(outputs)

             if self.is_encoder_decoder:
@@ -257,9 +235,7 @@ def test_attention_outputs(self):
                     correct_outlen += 1  # loss is added to beginning
                 # Question Answering model returns start_logits and end_logits
                 if model_class.__name__.endswith("ForQuestionAnswering"):
-                    correct_outlen += (
-                        1  # start_logits and end_logits instead of only 1 output
-                    )
+                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output

                 if "past_key_values" in outputs:
                     correct_outlen += 1  # past_key_values have been returned
@@ -268,29 +244,28 @@ def test_attention_outputs(self):
                 # decoder attentions
                 decoder_attentions = outputs.decoder_attentions
                 self.assertIsInstance(decoder_attentions, (list, tuple))
-                self.assertEqual(
-                    len(decoder_attentions),
-                    self.model_tester.num_hidden_layers)
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(decoder_attentions[0].shape[-3:]),
                     [
                         self.model_tester.num_attention_heads,
                         decoder_seq_length,
                         decoder_key_length,
-                    ], )
+                    ],
+                )

                 # cross attentions
                 cross_attentions = outputs.cross_attentions
                 self.assertIsInstance(cross_attentions, (list, tuple))
-                self.assertEqual(
-                    len(cross_attentions), self.model_tester.num_hidden_layers)
+                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(cross_attentions[0].shape[-3:]),
                     [
                         self.model_tester.num_attention_heads,
                         decoder_seq_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )

             # Check attention is always last and order is fine
             inputs_dict["output_attentions"] = True
@@ -298,8 +273,7 @@ def test_attention_outputs(self):
             model = self._make_model_instance(config, model_class)
             model.eval()
             with paddle.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict,
-                                                          model_class))
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

             if hasattr(self.model_tester, "num_hidden_states_types"):
                 added_hidden_states = self.model_tester.num_hidden_states_types
@@ -309,11 +283,9 @@ def test_attention_outputs(self):
                 added_hidden_states = 1
             self.assertEqual(out_len + added_hidden_states, len(outputs))

-            self_attentions = (outputs.encoder_attentions if
-                               self.is_encoder_decoder else outputs.attentions)
+            self_attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions

-            self.assertEqual(
-                len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
             if chunk_length is not None:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-4:]),
@@ -322,7 +294,8 @@ def test_attention_outputs(self):
                         encoder_seq_length,
                         chunk_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )
             else:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-3:]),
@@ -330,7 +303,8 @@ def test_attention_outputs(self):
                         self.model_tester.num_attention_heads,
                         encoder_seq_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )

     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
@@ -338,29 +312,28 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             model.eval()

             with paddle.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict,
-                                                          model_class))
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

-            hidden_states = (outputs.encoder_hidden_states if
-                             self.is_encoder_decoder else outputs.hidden_states)
+            hidden_states = outputs.encoder_hidden_states if self.is_encoder_decoder else outputs.hidden_states

             expected_num_layers = getattr(
                 self.model_tester,
                 "expected_num_hidden_layers",
-                self.model_tester.num_hidden_layers + 1, )
+                self.model_tester.num_hidden_layers + 1,
+            )
             self.assertEqual(len(hidden_states), expected_num_layers)

             if hasattr(self.model_tester, "encoder_seq_length"):
                 seq_length = self.model_tester.encoder_seq_length
-                if (hasattr(self.model_tester, "chunk_length") and
-                        self.model_tester.chunk_length > 1):
+                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
                     seq_length = seq_length * self.model_tester.chunk_length
             else:
                 seq_length = self.model_tester.seq_length

             self.assertListEqual(
                 list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size], )
+                [seq_length, self.model_tester.hidden_size],
+            )

             if self.is_encoder_decoder:
                 hidden_states = outputs.decoder_hidden_states
@@ -368,24 +341,20 @@ def check_hidden_states_output(inputs_dict, config, model_class):
                 self.assertIsInstance(hidden_states, (list, tuple))
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester,
-                                             "decoder_seq_length", seq_len)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size], )
+                    [decoder_seq_length, self.model_tester.hidden_size],
+                )

-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         inputs_dict["return_dict"] = True
         for model_class in self.all_model_classes:
             signature = inspect.signature(model_class.forward)
             # signature.parameters is an OrderedDict => so arg_names order is deterministic
             arg_names = [*signature.parameters.keys()]
-            if not all(
-                    name in arg_names
-                    for name in
-                ["output_attentions", "output_hidden_states", "return_dict"]):
+            if not all(name in arg_names for name in ["output_attentions", "output_hidden_states", "return_dict"]):
                 continue
             inputs_dict["output_hidden_states"] = True
             check_hidden_states_output(inputs_dict, config, model_class)
@@ -417,7 +386,8 @@ def test_resize_position_vector_embeddings(self):
             if self.is_encoder_decoder:
                 (
                     encoder_model_embed,
-                    decoder_model_embed, ) = model.get_position_embeddings()
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
                 encoder_cloned_embeddings = encoder_model_embed.weight.clone()
                 decoder_cloned_embeddings = decoder_model_embed.weight.clone()
             else:
@@ -427,24 +397,25 @@ def test_resize_position_vector_embeddings(self):
             # Check that resizing the position embeddings with a larger max_position_embeddings increases
             # the model's postion embeddings size
             model.resize_position_embeddings(max_position_embeddings + 10)
-            self.assertEqual(model.config.max_position_embeddings,
-                             max_position_embeddings + 10)
+            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10)

             # Check that it actually resizes the embeddings matrix
             if model.config.is_encoder_decoder:
                 (
                     encoder_model_embed,
-                    decoder_model_embed, ) = model.get_position_embeddings()
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
                 self.assertEqual(
                     encoder_model_embed.weight.shape[0],
-                    encoder_cloned_embeddings.shape[0] + 10, )
+                    encoder_cloned_embeddings.shape[0] + 10,
+                )
                 self.assertEqual(
                     decoder_model_embed.weight.shape[0],
-                    decoder_cloned_embeddings.shape[0] + 10, )
+                    decoder_cloned_embeddings.shape[0] + 10,
+                )
             else:
                 model_embed = model.get_position_embeddings()
-                self.assertEqual(model_embed.weight.shape[0],
-                                 cloned_embeddings.shape[0] + 10)
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             model(**self._prepare_for_class(inputs_dict, model_class))
@@ -454,23 +425,26 @@ def test_resize_position_vector_embeddings(self):
             model.resize_position_embeddings(max_position_embeddings - 5)
             self.assertEqual(
                 model.base_model.config["max_position_embeddings"],
-                max_position_embeddings - 5, )
+                max_position_embeddings - 5,
+            )

             # Check that it actually resizes the embeddings matrix
             if self.is_encoder_decoder:
                 (
                     encoder_model_embed,
-                    decoder_model_embed, ) = model.get_position_embeddings()
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
                 self.assertEqual(
                     encoder_model_embed.weight.shape[0],
-                    encoder_cloned_embeddings.shape[0] - 5, )
+                    encoder_cloned_embeddings.shape[0] - 5,
+                )
                 self.assertEqual(
                     decoder_model_embed.weight.shape[0],
-                    decoder_cloned_embeddings.shape[0] - 5, )
+                    decoder_cloned_embeddings.shape[0] - 5,
+                )
             else:
                 model_embed = model.get_position_embeddings()
-                self.assertEqual(model_embed.weight.shape[0],
-                                 cloned_embeddings.shape[0] - 5)
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             model(**self._prepare_for_class(inputs_dict, model_class))
@@ -479,12 +453,10 @@ def test_resize_position_vector_embeddings(self):

             models_equal = True
             if model.config.is_encoder_decoder:
-                for p1, p2 in zip(encoder_cloned_embeddings,
-                                  encoder_model_embed.weight):
+                for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight):
                     if p1.data.ne(p2.data).sum() > 0:
                         models_equal = False
-                for p1, p2 in zip(decoder_cloned_embeddings,
-                                  decoder_model_embed.weight):
+                for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight):
                     if p1.data.ne(p2.data).sum() > 0:
                         models_equal = False
             else:
@@ -515,32 +487,27 @@ def test_resize_tokens_embeddings(self):
             # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size + 10)
-            self.assertEqual(model.base_model.config.vocab_size,
-                             model_vocab_size + 10)
+            self.assertEqual(model.base_model.config.vocab_size, model_vocab_size + 10)
             # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0],
-                             cloned_embeddings.shape[0] + 10)
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             model(**self._prepare_for_class(inputs_dict, model_class))

             # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size - 15)
-            self.assertEqual(model.base_model.config.vocab_size,
-                             model_vocab_size - 15)
+            self.assertEqual(model.base_model.config.vocab_size, model_vocab_size - 15)
             # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0],
-                             cloned_embeddings.shape[0] - 15)
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             # Input ids should be clamped to the maximum size of the vocabulary
-            inputs_dict["input_ids"] = paddle.clip(
-                inputs_dict["input_ids"], max=model_vocab_size - 15 - 1)
+            inputs_dict["input_ids"] = paddle.clip(inputs_dict["input_ids"], max=model_vocab_size - 15 - 1)

             # make sure that decoder_input_ids are resized as well
             if "decoder_input_ids" in inputs_dict:
                 inputs_dict["decoder_input_ids"] = paddle.clip(
-                    inputs_dict["decoder_input_ids"],
-                    max=model_vocab_size - 15 - 1)
+                    inputs_dict["decoder_input_ids"], max=model_vocab_size - 15 - 1
+                )
             model(**self._prepare_for_class(inputs_dict, model_class))

             # Check that adding and removing tokens has not modified the first part of the embedding matrix.
@@ -566,15 +533,13 @@ def test_inputs_embeds(self):
         if not self.use_test_inputs_embeds:
             return
         # get config for model and inputs_dict for model forward
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         # test all model classes
         for model_class in self.all_model_classes:
             model = self._make_model_instance(config, model_class)
             model.eval()
-            inputs = copy.deepcopy(
-                self._prepare_for_class(inputs_dict, model_class))
+            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

             with paddle.no_grad():
                 ids_output = model(**inputs)
@@ -584,8 +549,7 @@ def test_inputs_embeds(self):
                 del inputs["input_ids"]
             else:
                 encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids",
-                                               encoder_input_ids)
+                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
                 del inputs["input_ids"]
                 inputs.pop("decoder_input_ids", None)

@@ -616,8 +580,7 @@ def test_model_name_list(self):
         self.assertTrue(len(model.model_name_list) != 0)

     def test_pretrained_config_save_load(self):
-        if (self.base_model_class is None or
-                not self.base_model_class.constructed_from_pretrained_config()):
+        if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config():
             return

         config_class = self.base_model_class.config_class
@@ -627,23 +590,21 @@ def test_pretrained_config_save_load(self):
             config.save_pretrained(tempdir)

             # check the file exist
-            self.assertFalse(
-                os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME)))
+            self.assertFalse(os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME)))
             self.assertTrue(os.path.exists(os.path.join(tempdir, CONFIG_NAME)))

             # rename the CONFIG_NAME
             shutil.move(
                 os.path.join(tempdir, CONFIG_NAME),
-                os.path.join(tempdir, LEGACY_CONFIG_NAME), )
+                os.path.join(tempdir, LEGACY_CONFIG_NAME),
+            )

             loaded_config = config.__class__.from_pretrained(tempdir)
             for key in config.__dict__.keys():
-                self.assertEqual(
-                    getattr(config, key), getattr(loaded_config, key))
+                self.assertEqual(getattr(config, key), getattr(loaded_config, key))

     def random_choice_pretrained_config_field(self) -> Optional[str]:
-        if (self.base_model_class is None or
-                not self.base_model_class.constructed_from_pretrained_config()):
+        if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config():
             return None

         config = self.base_model_class.config_class()
@@ -652,21 +613,17 @@ def random_choice_pretrained_config_field(self) -> Optional[str]:

     def test_for_missed_attribute(self):
         if not self.test_model_compatibility_keys:
-            self.skipTest(
-                f"Do not test model_compatibility_keys on {self.base_model_class}"
-            )
+            self.skipTest(f"Do not test model_compatibility_keys on {self.base_model_class}")
             return

-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         for model_class in self.all_model_classes:
             if not model_class.constructed_from_pretrained_config():
                 continue

             model = self._make_model_instance(config, model_class)

-            all_maps: dict = copy.deepcopy(
-                model_class.config_class.attribute_map)
+            all_maps: dict = copy.deepcopy(model_class.config_class.attribute_map)
             for old_attribute, new_attribute in all_maps.items():
                 old_value = getattr(model.config, old_attribute)
@@ -683,11 +640,9 @@ def test_tie_weight(self):
         if not self.test_tie_weights:
             return

-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
-            if ("CausalLM" not in model_class.__name__ and
-                    "MaskedLM" not in model_class.__name__):
+            if "CausalLM" not in model_class.__name__ and "MaskedLM" not in model_class.__name__:
                 continue

             model = self._make_model_instance(config, model_class)
@@ -695,8 +650,7 @@ def test_tie_weight(self):
             if not model.config.tie_word_embeddings:
                 continue

-            if hasattr(model, "get_input_embeddings") and hasattr(
-                    model, "get_output_embeddings"):
+            if hasattr(model, "get_input_embeddings") and hasattr(model, "get_output_embeddings"):
                 try:
                     input_embeddings = model.get_input_embeddings()
                 except NotImplementedError:
@@ -719,14 +673,16 @@ def test_tie_weight(self):
                     input_embeddings_weight = input_embeddings
                 print(
                     input_embeddings_weight,
-                    output_embeddings_weight, )
-                print("model name :{},id is{},{}".format(
-                    model_class,
-                    id(output_embeddings_weight),
-                    id(input_embeddings_weight), ))
-                self.assertEqual(
-                    id(output_embeddings_weight),
-                    id(input_embeddings_weight))
+                    output_embeddings_weight,
+                )
+                print(
+                    "model name :{},id is{},{}".format(
+                        model_class,
+                        id(output_embeddings_weight),
+                        id(input_embeddings_weight),
+                    )
+                )
+                self.assertEqual(id(output_embeddings_weight), id(input_embeddings_weight))


 class ModelTesterPretrainedMixin:
@@ -739,48 +695,42 @@ class ModelTesterPretrainedMixin:
     def test_model_from_pretrained_hf_hub(self):
         if self.hf_remote_test_model_path is None or self.base_model_class is None:
             return
-        model = self.base_model_class.from_pretrained(
-            self.hf_remote_test_model_path, from_hf_hub=True)
+        model = self.base_model_class.from_pretrained(self.hf_remote_test_model_path, from_hf_hub=True)
         self.assertIsNotNone(model)

     def test_model_from_pretrained_paddle_hub(self):
-        if (self.paddlehub_remote_test_model_path is None or
-                self.base_model_class is None):
+        if self.paddlehub_remote_test_model_path is None or self.base_model_class is None:
             return
-        model = self.base_model_class.from_pretrained(
-            self.paddlehub_remote_test_model_path)
+        model = self.base_model_class.from_pretrained(self.paddlehub_remote_test_model_path)
         self.assertIsNotNone(model)

     def test_model_from_config_paddle_hub(self):
-        if (self.paddlehub_remote_test_model_path is None or
-                self.base_model_class is None):
+        if self.paddlehub_remote_test_model_path is None or self.base_model_class is None:
             return
-        config = self.base_model_class.config_class.from_pretrained(
-            self.paddlehub_remote_test_model_path)
+        config = self.base_model_class.config_class.from_pretrained(self.paddlehub_remote_test_model_path)
         model = self.base_model_class._from_config(config)
         self.assertIsNotNone(model)

     @slow
     def test_model_from_pretrained_with_cache_dir(self):
-        for model_name in list(
-                self.base_model_class.pretrained_init_configuration)[:1]:
+        for model_name in list(self.base_model_class.pretrained_init_configuration)[:1]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = str(tempdir)

-                model = self.base_model_class.from_pretrained(
-                    model_name, cache_dir=tempdir)
+                model = self.base_model_class.from_pretrained(model_name, cache_dir=tempdir)
                 self.assertIsNotNone(model)

                 self.assertTrue(
                     os.path.isfile(
                         os.path.join(
                             tempdir, model_name,
-                            self.base_model_class.resource_files_names[
-                                "model_state"], )))
+                            self.base_model_class.resource_files_names["model_state"],
+                        )
+                    )
+                )
                 self.assertTrue(
-                    os.path.isfile(
-                        os.path.join(tempdir, model_name,
-                                     self.base_model_class.model_config_file)))
+                    os.path.isfile(os.path.join(tempdir, model_name, self.base_model_class.model_config_file))
+                )

     @slow
     def test_pretrained_save_and_load(self):
@@ -788,8 +738,7 @@ def test_pretrained_save_and_load(self):

         eg: `bert-base-uncased.pdparams` and `model_state.pdparams`
         """
-        for model_name in list(
-                self.base_model_class.pretrained_init_configuration)[:1]:
+        for model_name in list(self.base_model_class.pretrained_init_configuration)[:1]:
             model = self.base_model_class.from_pretrained(model_name)
             self.assertIsNotNone(model)

@@ -798,8 +747,7 @@ def test_pretrained_save_and_load(self):
                 tempdirname = str(tempdir)
                 model.save_pretrained(tempdirname)

-                loaded_model = self.base_model_class.from_pretrained(
-                    tempdirname)
+                loaded_model = self.base_model_class.from_pretrained(tempdirname)

                 check_two_model_parameter(model, loaded_model)

@@ -809,20 +757,20 @@ def test_pretrained_save_and_load(self):

                 shutil.copytree(
                     os.path.join(MODEL_HOME, model_name),
-                    tempdirname, )
+                    tempdirname,
+                )

                 saved_model_state_file = os.path.join(
                     tempdirname,
-                    self.base_model_class.resource_files_names["model_state"], )
+                    self.base_model_class.resource_files_names["model_state"],
+                )
                 self.assertTrue(os.path.isfile(saved_model_state_file))

                 # rename it to the old style: name of url, eg: model_state.pdparams -> bert-base-uncased.pdparams
-                url = self.base_model_class.pretrained_resource_files_map[
-                    "model_state"][model_name]
+                url = self.base_model_class.pretrained_resource_files_map["model_state"][model_name]
                 pretrained_resource_file_name = os.path.split(url)[-1]
-                target_file_path = os.path.join(tempdirname,
-                                                pretrained_resource_file_name)
+                target_file_path = os.path.join(tempdirname, pretrained_resource_file_name)

                 shutil.copyfile(saved_model_state_file, target_file_path)
                 os.remove(saved_model_state_file)
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 27448810955d7..dbf16a00360e8 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -14,18 +14,10 @@

 from __future__ import annotations

-import copy
-import gc
-import inspect
 import os
-import sys
 import unittest
 from argparse import ArgumentTypeError

-import numpy as np
-import paddle
-import yaml
-

 def strtobool(v):
     if isinstance(v, bool):
@@ -47,9 +39,7 @@ def get_bool_from_env(key, default_value=False):
     try:
         value = strtobool(value)
     except ValueError:
-        raise ValueError(
-            f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive)."
-        )
+        raise ValueError(f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive).")
     return value